-rw-r--r--  dom/plugins/ipc/PluginInstanceParent.cpp | 1
-rw-r--r--  dom/plugins/ipc/PluginInstanceParent.h | 5
-rwxr-xr-x  intl/unicharutil/tools/genUnicodePropertyData.pl | 472
-rw-r--r--  intl/unicharutil/util/nsUnicodeProperties.h | 1
-rw-r--r--  intl/unicharutil/util/nsUnicodePropertyData.cpp | 96
-rw-r--r--  intl/unicharutil/util/nsUnicodeScriptCodes.h | 48
-rw-r--r--  ipc/chromium/src/base/condition_variable_posix.cc | 3
-rw-r--r--  media/libjpeg/1050342.diff | 121
-rw-r--r--  media/libjpeg/ChangeLog.md | 1839
-rw-r--r--  media/libjpeg/LICENSE.md | 54
-rw-r--r--  media/libjpeg/MOZCHANGES | 28
-rw-r--r--  media/libjpeg/README.ijg | 99
-rw-r--r-- [-rwxr-xr-x]  media/libjpeg/README.md | 100
-rw-r--r--  media/libjpeg/jaricom.c | 13
-rw-r--r--  media/libjpeg/jcapimin.c | 50
-rw-r--r--  media/libjpeg/jcapistd.c | 26
-rw-r--r--  media/libjpeg/jcarith.c | 114
-rw-r--r--  media/libjpeg/jccoefct.c | 102
-rw-r--r--  media/libjpeg/jccolext.c | 64
-rw-r--r--  media/libjpeg/jccolor.c | 389
-rw-r--r--  media/libjpeg/jcdctmgr.c | 157
-rw-r--r--  media/libjpeg/jchuff.c | 620
-rw-r--r--  media/libjpeg/jchuff.h | 13
-rw-r--r--  media/libjpeg/jcicc.c | 105
-rw-r--r--  media/libjpeg/jcinit.c | 15
-rw-r--r--  media/libjpeg/jcmainct.c | 40
-rw-r--r--  media/libjpeg/jcmarker.c | 93
-rw-r--r--  media/libjpeg/jcmaster.c | 137
-rw-r--r--  media/libjpeg/jcomapi.c | 12
-rw-r--r--  media/libjpeg/jconfigint.h | 46
-rw-r--r--  media/libjpeg/jcparam.c | 87
-rw-r--r--  media/libjpeg/jcphuff.c | 688
-rw-r--r--  media/libjpeg/jcprepct.c | 100
-rw-r--r--  media/libjpeg/jcsample.c | 173
-rw-r--r--  media/libjpeg/jctrans.c | 77
-rw-r--r--  media/libjpeg/jdapimin.c | 51
-rw-r--r--  media/libjpeg/jdapistd.c | 194
-rw-r--r--  media/libjpeg/jdarith.c | 156
-rw-r--r--  media/libjpeg/jdatadst.c | 62
-rw-r--r--  media/libjpeg/jdatasrc.c | 56
-rw-r--r--  media/libjpeg/jdcoefct.c | 505
-rw-r--r--  media/libjpeg/jdcoefct.h | 9
-rw-r--r--  media/libjpeg/jdcol565.c | 184
-rw-r--r--  media/libjpeg/jdcolext.c | 42
-rw-r--r--  media/libjpeg/jdcolor.c | 493
-rw-r--r--  media/libjpeg/jdct.h | 148
-rw-r--r--  media/libjpeg/jddctmgr.c | 36
-rw-r--r--  media/libjpeg/jdhuff.c | 223
-rw-r--r--  media/libjpeg/jdhuff.h | 109
-rw-r--r--  media/libjpeg/jdicc.c | 167
-rw-r--r--  media/libjpeg/jdinput.c | 101
-rw-r--r--  media/libjpeg/jdmainct.c | 123
-rw-r--r--  media/libjpeg/jdmainct.h | 12
-rw-r--r--  media/libjpeg/jdmarker.c | 357
-rw-r--r--  media/libjpeg/jdmaster.c | 190
-rw-r--r--  media/libjpeg/jdmerge.c | 407
-rw-r--r--  media/libjpeg/jdmerge.h | 47
-rw-r--r--  media/libjpeg/jdmrg565.c | 202
-rw-r--r--  media/libjpeg/jdmrgext.c | 86
-rw-r--r--  media/libjpeg/jdphuff.c | 179
-rw-r--r--  media/libjpeg/jdpostct.c | 122
-rw-r--r--  media/libjpeg/jdsample.c | 169
-rw-r--r--  media/libjpeg/jdtrans.c | 19
-rw-r--r--  media/libjpeg/jerror.c | 32
-rw-r--r--  media/libjpeg/jerror.h | 116
-rw-r--r--  media/libjpeg/jfdctflt.c | 58
-rw-r--r--  media/libjpeg/jfdctfst.c | 50
-rw-r--r--  media/libjpeg/jfdctint.c | 122
-rw-r--r--  media/libjpeg/jidctflt.c | 112
-rw-r--r--  media/libjpeg/jidctfst.c | 156
-rw-r--r--  media/libjpeg/jidctint.c | 1734
-rw-r--r--  media/libjpeg/jidctred.c | 256
-rw-r--r--  media/libjpeg/jinclude.h | 129
-rw-r--r--  media/libjpeg/jmemmgr.c | 320
-rw-r--r--  media/libjpeg/jmemnobs.c | 45
-rw-r--r--  media/libjpeg/jmemsys.h | 28
-rw-r--r--  media/libjpeg/jmorecfg.h | 107
-rw-r--r--  media/libjpeg/jpegcomp.h | 35
-rw-r--r--  media/libjpeg/jpegint.h | 159
-rw-r--r--  media/libjpeg/jpeglib.h | 216
-rw-r--r--  media/libjpeg/jquant1.c | 205
-rw-r--r--  media/libjpeg/jquant2.c | 401
-rw-r--r--  media/libjpeg/jsimd.h | 188
-rw-r--r--  media/libjpeg/jsimd_none.c | 257
-rw-r--r--  media/libjpeg/jsimddct.h | 104
-rw-r--r--  media/libjpeg/jstdhuff.c | 135
-rw-r--r--  media/libjpeg/jutils.c | 30
-rw-r--r--  media/libjpeg/jversion.h | 35
-rw-r--r--  media/libjpeg/moz.build | 227
-rw-r--r--  media/libjpeg/mozilla.diff | 67
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jccolext-neon.c | 148
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jchuff-neon.c | 334
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd.c | 980
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd_neon.S | 1200
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jccolext-neon.c | 316
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jchuff-neon.c | 411
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd.c | 1058
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd_neon.S (renamed from media/libjpeg/simd/jsimd_arm64_neon.S) | 1835
-rw-r--r--  media/libjpeg/simd/arm/align.h | 28
-rw-r--r--  media/libjpeg/simd/arm/jccolor-neon.c | 160
-rw-r--r--  media/libjpeg/simd/arm/jcgray-neon.c | 120
-rw-r--r--  media/libjpeg/simd/arm/jcgryext-neon.c | 106
-rw-r--r--  media/libjpeg/simd/arm/jchuff.h | 131
-rw-r--r--  media/libjpeg/simd/arm/jcphuff-neon.c | 622
-rw-r--r--  media/libjpeg/simd/arm/jcsample-neon.c | 192
-rw-r--r--  media/libjpeg/simd/arm/jdcolext-neon.c | 374
-rw-r--r--  media/libjpeg/simd/arm/jdcolor-neon.c | 142
-rw-r--r--  media/libjpeg/simd/arm/jdmerge-neon.c | 145
-rw-r--r--  media/libjpeg/simd/arm/jdmrgext-neon.c | 723
-rw-r--r--  media/libjpeg/simd/arm/jdsample-neon.c | 569
-rw-r--r--  media/libjpeg/simd/arm/jfdctfst-neon.c | 214
-rw-r--r--  media/libjpeg/simd/arm/jfdctint-neon.c | 376
-rw-r--r--  media/libjpeg/simd/arm/jidctfst-neon.c | 472
-rw-r--r--  media/libjpeg/simd/arm/jidctint-neon.c | 802
-rw-r--r--  media/libjpeg/simd/arm/jidctred-neon.c | 486
-rw-r--r--  media/libjpeg/simd/arm/jquanti-neon.c | 193
-rw-r--r--  media/libjpeg/simd/arm/neon-compat.h | 33
-rw-r--r--  media/libjpeg/simd/i386/jccolext-avx2.asm | 578
-rw-r--r--  media/libjpeg/simd/i386/jccolext-mmx.asm | 476
-rw-r--r--  media/libjpeg/simd/i386/jccolext-sse2.asm | 503
-rw-r--r--  media/libjpeg/simd/i386/jccolor-avx2.asm | 121
-rw-r--r--  media/libjpeg/simd/i386/jccolor-mmx.asm | 121
-rw-r--r--  media/libjpeg/simd/i386/jccolor-sse2.asm | 120
-rw-r--r--  media/libjpeg/simd/i386/jcgray-avx2.asm | 113
-rw-r--r--  media/libjpeg/simd/i386/jcgray-mmx.asm | 113
-rw-r--r--  media/libjpeg/simd/i386/jcgray-sse2.asm | 112
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-avx2.asm | 457
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-mmx.asm | 355
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-sse2.asm | 382
-rw-r--r--  media/libjpeg/simd/i386/jchuff-sse2.asm | 761
-rw-r--r--  media/libjpeg/simd/i386/jcphuff-sse2.asm | 662
-rw-r--r--  media/libjpeg/simd/i386/jcsample-avx2.asm | 388
-rw-r--r--  media/libjpeg/simd/i386/jcsample-mmx.asm | 324
-rw-r--r--  media/libjpeg/simd/i386/jcsample-sse2.asm | 351
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-avx2.asm | 515
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-mmx.asm | 404
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-sse2.asm | 458
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-avx2.asm | 118
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-mmx.asm | 117
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-sse2.asm | 117
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-avx2.asm | 136
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-mmx.asm | 123
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-sse2.asm | 135
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-avx2.asm | 575
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-mmx.asm | 460
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-sse2.asm | 517
-rw-r--r--  media/libjpeg/simd/i386/jdsample-avx2.asm | 760
-rw-r--r--  media/libjpeg/simd/i386/jdsample-mmx.asm | 731
-rw-r--r--  media/libjpeg/simd/i386/jdsample-sse2.asm | 724
-rw-r--r--  media/libjpeg/simd/i386/jfdctflt-3dn.asm | 318
-rw-r--r--  media/libjpeg/simd/i386/jfdctflt-sse.asm | 369
-rw-r--r--  media/libjpeg/simd/i386/jfdctfst-mmx.asm | 395
-rw-r--r--  media/libjpeg/simd/i386/jfdctfst-sse2.asm | 403
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-avx2.asm | 331
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-mmx.asm | 620
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-sse2.asm | 633
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-3dn.asm | 451
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-sse.asm | 571
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-sse2.asm | 497
-rw-r--r--  media/libjpeg/simd/i386/jidctfst-mmx.asm | 499
-rw-r--r--  media/libjpeg/simd/i386/jidctfst-sse2.asm | 501
-rw-r--r--  media/libjpeg/simd/i386/jidctint-avx2.asm | 453
-rw-r--r--  media/libjpeg/simd/i386/jidctint-mmx.asm | 851
-rw-r--r--  media/libjpeg/simd/i386/jidctint-sse2.asm | 858
-rw-r--r--  media/libjpeg/simd/i386/jidctred-mmx.asm | 704
-rw-r--r--  media/libjpeg/simd/i386/jidctred-sse2.asm | 592
-rw-r--r--  media/libjpeg/simd/i386/jquant-3dn.asm | 230
-rw-r--r--  media/libjpeg/simd/i386/jquant-mmx.asm | 276
-rw-r--r--  media/libjpeg/simd/i386/jquant-sse.asm | 208
-rw-r--r--  media/libjpeg/simd/i386/jquantf-sse2.asm | 168
-rw-r--r--  media/libjpeg/simd/i386/jquanti-avx2.asm | 188
-rw-r--r--  media/libjpeg/simd/i386/jquanti-sse2.asm | 201
-rw-r--r--  media/libjpeg/simd/i386/jsimd.c | 1246
-rw-r--r--  media/libjpeg/simd/i386/jsimdcpu.asm | 135
-rw-r--r--  media/libjpeg/simd/jccolext-mmx.asm | 476
-rw-r--r--  media/libjpeg/simd/jccolext-sse2-64.asm | 486
-rw-r--r--  media/libjpeg/simd/jccolext-sse2.asm | 503
-rw-r--r--  media/libjpeg/simd/jccolor-altivec.c | 104
-rw-r--r--  media/libjpeg/simd/jccolor-mmx.asm | 122
-rw-r--r--  media/libjpeg/simd/jccolor-sse2-64.asm | 121
-rw-r--r--  media/libjpeg/simd/jccolor-sse2.asm | 121
-rw-r--r--  media/libjpeg/simd/jcgray-altivec.c | 99
-rw-r--r--  media/libjpeg/simd/jcgray-mmx.asm | 115
-rw-r--r--  media/libjpeg/simd/jcgray-sse2-64.asm | 114
-rw-r--r--  media/libjpeg/simd/jcgray-sse2.asm | 114
-rw-r--r--  media/libjpeg/simd/jcgryext-mmx.asm | 356
-rw-r--r--  media/libjpeg/simd/jcgryext-sse2-64.asm | 365
-rw-r--r--  media/libjpeg/simd/jcgryext-sse2.asm | 384
-rw-r--r--  media/libjpeg/simd/jchuff-sse2-64.asm | 360
-rw-r--r--  media/libjpeg/simd/jchuff-sse2.asm | 426
-rw-r--r--  media/libjpeg/simd/jcolsamp.inc | 104
-rw-r--r--  media/libjpeg/simd/jcsample-mmx.asm | 323
-rw-r--r--  media/libjpeg/simd/jcsample-sse2-64.asm | 329
-rw-r--r--  media/libjpeg/simd/jcsample-sse2.asm | 350
-rw-r--r--  media/libjpeg/simd/jdcolext-mmx.asm | 404
-rw-r--r--  media/libjpeg/simd/jdcolext-sse2-64.asm | 440
-rw-r--r--  media/libjpeg/simd/jdcolext-sse2.asm | 459
-rw-r--r--  media/libjpeg/simd/jdcolor-altivec.c | 96
-rw-r--r--  media/libjpeg/simd/jdcolor-mmx.asm | 119
-rw-r--r--  media/libjpeg/simd/jdcolor-sse2-64.asm | 119
-rw-r--r--  media/libjpeg/simd/jdcolor-sse2.asm | 119
-rw-r--r--  media/libjpeg/simd/jdmerge-altivec.c | 108
-rw-r--r--  media/libjpeg/simd/jdmerge-mmx.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmerge-sse2-64.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmerge-sse2.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmrgext-mmx.asm | 463
-rw-r--r--  media/libjpeg/simd/jdmrgext-sse2-64.asm | 537
-rw-r--r--  media/libjpeg/simd/jdmrgext-sse2.asm | 518
-rw-r--r--  media/libjpeg/simd/jdsample-mmx.asm | 736
-rw-r--r--  media/libjpeg/simd/jdsample-sse2-64.asm | 670
-rw-r--r--  media/libjpeg/simd/jdsample-sse2.asm | 728
-rw-r--r--  media/libjpeg/simd/jfdctflt-3dn.asm | 319
-rw-r--r--  media/libjpeg/simd/jfdctflt-sse-64.asm | 357
-rw-r--r--  media/libjpeg/simd/jfdctflt-sse.asm | 369
-rw-r--r--  media/libjpeg/simd/jfdctfst-mmx.asm | 396
-rw-r--r--  media/libjpeg/simd/jfdctfst-sse2-64.asm | 391
-rw-r--r--  media/libjpeg/simd/jfdctfst-sse2.asm | 403
-rw-r--r--  media/libjpeg/simd/jfdctint-altivec.c | 262
-rw-r--r--  media/libjpeg/simd/jfdctint-mmx.asm | 621
-rw-r--r--  media/libjpeg/simd/jfdctint-sse2-64.asm | 621
-rw-r--r--  media/libjpeg/simd/jfdctint-sse2.asm | 633
-rw-r--r--  media/libjpeg/simd/jidctflt-3dn.asm | 451
-rw-r--r--  media/libjpeg/simd/jidctflt-sse.asm | 571
-rw-r--r--  media/libjpeg/simd/jidctflt-sse2-64.asm | 482
-rw-r--r--  media/libjpeg/simd/jidctflt-sse2.asm | 497
-rw-r--r--  media/libjpeg/simd/jidctfst-mmx.asm | 499
-rw-r--r--  media/libjpeg/simd/jidctfst-sse2-64.asm | 491
-rw-r--r--  media/libjpeg/simd/jidctfst-sse2.asm | 501
-rw-r--r--  media/libjpeg/simd/jidctint-mmx.asm | 851
-rw-r--r--  media/libjpeg/simd/jidctint-sse2-64.asm | 847
-rw-r--r--  media/libjpeg/simd/jidctint-sse2.asm | 858
-rw-r--r--  media/libjpeg/simd/jidctred-mmx.asm | 705
-rw-r--r--  media/libjpeg/simd/jidctred-sse2-64.asm | 575
-rw-r--r--  media/libjpeg/simd/jidctred-sse2.asm | 593
-rw-r--r--  media/libjpeg/simd/jpeg_nbits_table.inc | 4097
-rw-r--r--  media/libjpeg/simd/jquant-3dn.asm | 232
-rw-r--r--  media/libjpeg/simd/jquant-mmx.asm | 273
-rw-r--r--  media/libjpeg/simd/jquant-sse.asm | 210
-rw-r--r--  media/libjpeg/simd/jquantf-sse2-64.asm | 157
-rw-r--r--  media/libjpeg/simd/jquantf-sse2.asm | 170
-rw-r--r--  media/libjpeg/simd/jquanti-sse2-64.asm | 186
-rw-r--r--  media/libjpeg/simd/jquanti-sse2.asm | 199
-rw-r--r--  media/libjpeg/simd/jsimd.h | 1501
-rw-r--r--  media/libjpeg/simd/jsimd_arm.c | 727
-rw-r--r--  media/libjpeg/simd/jsimd_arm64.c | 802
-rw-r--r--  media/libjpeg/simd/jsimd_arm_neon.S | 2878
-rw-r--r--  media/libjpeg/simd/jsimd_i386.c | 1091
-rw-r--r--  media/libjpeg/simd/jsimd_mips.c | 1138
-rw-r--r--  media/libjpeg/simd/jsimd_mips_dspr2.S | 4487
-rw-r--r--  media/libjpeg/simd/jsimd_mips_dspr2_asm.h | 285
-rw-r--r--  media/libjpeg/simd/jsimd_powerpc.c | 828
-rw-r--r--  media/libjpeg/simd/jsimd_x86_64.c | 887
-rw-r--r--  media/libjpeg/simd/jsimdcpu.asm | 104
-rw-r--r--  media/libjpeg/simd/jsimdext.inc | 375
-rw-r--r--  media/libjpeg/simd/mips/jsimd.c | 1147
-rw-r--r--  media/libjpeg/simd/mips/jsimd_dspr2.S | 4543
-rw-r--r--  media/libjpeg/simd/mips/jsimd_dspr2_asm.h | 292
-rw-r--r--  media/libjpeg/simd/mips64/jccolext-mmi.c | 455
-rw-r--r--  media/libjpeg/simd/mips64/jccolor-mmi.c | 148
-rw-r--r--  media/libjpeg/simd/mips64/jcgray-mmi.c | 132
-rw-r--r--  media/libjpeg/simd/mips64/jcgryext-mmi.c | 374
-rw-r--r--  media/libjpeg/simd/mips64/jcsample-mmi.c | 98
-rw-r--r--  media/libjpeg/simd/mips64/jcsample.h (renamed from media/libjpeg/simd/jcsample.h) | 8
-rw-r--r--  media/libjpeg/simd/mips64/jdcolext-mmi.c | 415
-rw-r--r--  media/libjpeg/simd/mips64/jdcolor-mmi.c | 139
-rw-r--r--  media/libjpeg/simd/mips64/jdmerge-mmi.c | 149
-rw-r--r--  media/libjpeg/simd/mips64/jdmrgext-mmi.c | 615
-rw-r--r--  media/libjpeg/simd/mips64/jdsample-mmi.c | 304
-rw-r--r--  media/libjpeg/simd/mips64/jfdctfst-mmi.c | 255
-rw-r--r--  media/libjpeg/simd/mips64/jfdctint-mmi.c | 398
-rw-r--r--  media/libjpeg/simd/mips64/jidctfst-mmi.c | 395
-rw-r--r--  media/libjpeg/simd/mips64/jidctint-mmi.c | 571
-rw-r--r--  media/libjpeg/simd/mips64/jquanti-mmi.c | 124
-rw-r--r--  media/libjpeg/simd/mips64/jsimd.c | 870
-rw-r--r--  media/libjpeg/simd/mips64/jsimd_mmi.h | 69
-rw-r--r--  media/libjpeg/simd/mips64/loongson-mmintrin.h | 1334
-rw-r--r--  media/libjpeg/simd/nasm/jcolsamp.inc | 135
-rw-r--r--  media/libjpeg/simd/nasm/jdct.inc (renamed from media/libjpeg/simd/jdct.inc) | 18
-rw-r--r-- [-rwxr-xr-x]  media/libjpeg/simd/nasm/jsimdcfg.inc (renamed from media/libjpeg/simd/jsimdcfg.inc) | 3
-rw-r--r--  media/libjpeg/simd/nasm/jsimdcfg.inc.h | 133
-rw-r--r--  media/libjpeg/simd/nasm/jsimdext.inc | 520
-rw-r--r--  media/libjpeg/simd/powerpc/jccolext-altivec.c (renamed from media/libjpeg/simd/jccolext-altivec.c) | 18
-rw-r--r--  media/libjpeg/simd/powerpc/jccolor-altivec.c | 116
-rw-r--r--  media/libjpeg/simd/powerpc/jcgray-altivec.c | 111
-rw-r--r--  media/libjpeg/simd/powerpc/jcgryext-altivec.c (renamed from media/libjpeg/simd/jcgryext-altivec.c) | 19
-rw-r--r--  media/libjpeg/simd/powerpc/jcsample-altivec.c (renamed from media/libjpeg/simd/jcsample-altivec.c) | 27
-rw-r--r--  media/libjpeg/simd/powerpc/jcsample.h | 28
-rw-r--r--  media/libjpeg/simd/powerpc/jdcolext-altivec.c (renamed from media/libjpeg/simd/jdcolext-altivec.c) | 12
-rw-r--r--  media/libjpeg/simd/powerpc/jdcolor-altivec.c | 106
-rw-r--r--  media/libjpeg/simd/powerpc/jdmerge-altivec.c | 130
-rw-r--r--  media/libjpeg/simd/powerpc/jdmrgext-altivec.c (renamed from media/libjpeg/simd/jdmrgext-altivec.c) | 34
-rw-r--r--  media/libjpeg/simd/powerpc/jdsample-altivec.c (renamed from media/libjpeg/simd/jdsample-altivec.c) | 78
-rw-r--r--  media/libjpeg/simd/powerpc/jfdctfst-altivec.c (renamed from media/libjpeg/simd/jfdctfst-altivec.c) | 80
-rw-r--r--  media/libjpeg/simd/powerpc/jfdctint-altivec.c | 258
-rw-r--r--  media/libjpeg/simd/powerpc/jidctfst-altivec.c (renamed from media/libjpeg/simd/jidctfst-altivec.c) | 124
-rw-r--r--  media/libjpeg/simd/powerpc/jidctint-altivec.c (renamed from media/libjpeg/simd/jidctint-altivec.c) | 302
-rw-r--r--  media/libjpeg/simd/powerpc/jquanti-altivec.c (renamed from media/libjpeg/simd/jquanti-altivec.c) | 48
-rw-r--r--  media/libjpeg/simd/powerpc/jsimd.c | 881
-rw-r--r--  media/libjpeg/simd/powerpc/jsimd_altivec.h (renamed from media/libjpeg/simd/jsimd_altivec.h) | 63
-rw-r--r--  media/libjpeg/simd/x86_64/jccolext-avx2.asm | 559
-rw-r--r--  media/libjpeg/simd/x86_64/jccolext-sse2.asm | 484
-rw-r--r--  media/libjpeg/simd/x86_64/jccolor-avx2.asm | 121
-rw-r--r--  media/libjpeg/simd/x86_64/jccolor-sse2.asm | 120
-rw-r--r--  media/libjpeg/simd/x86_64/jcgray-avx2.asm | 113
-rw-r--r--  media/libjpeg/simd/x86_64/jcgray-sse2.asm | 112
-rw-r--r--  media/libjpeg/simd/x86_64/jcgryext-avx2.asm | 438
-rw-r--r--  media/libjpeg/simd/x86_64/jcgryext-sse2.asm | 363
-rw-r--r--  media/libjpeg/simd/x86_64/jchuff-sse2.asm | 583
-rw-r--r--  media/libjpeg/simd/x86_64/jcphuff-sse2.asm | 639
-rw-r--r--  media/libjpeg/simd/x86_64/jcsample-avx2.asm | 367
-rw-r--r--  media/libjpeg/simd/x86_64/jcsample-sse2.asm | 330
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolext-avx2.asm | 496
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolext-sse2.asm | 439
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolor-avx2.asm | 118
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolor-sse2.asm | 117
-rw-r--r--  media/libjpeg/simd/x86_64/jdmerge-avx2.asm | 136
-rw-r--r--  media/libjpeg/simd/x86_64/jdmerge-sse2.asm | 135
-rw-r--r--  media/libjpeg/simd/x86_64/jdmrgext-avx2.asm | 596
-rw-r--r--  media/libjpeg/simd/x86_64/jdmrgext-sse2.asm | 538
-rw-r--r--  media/libjpeg/simd/x86_64/jdsample-avx2.asm | 696
-rw-r--r--  media/libjpeg/simd/x86_64/jdsample-sse2.asm | 665
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctflt-sse.asm | 355
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctfst-sse2.asm | 389
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctint-avx2.asm | 320
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctint-sse2.asm | 619
-rw-r--r--  media/libjpeg/simd/x86_64/jidctflt-sse2.asm | 482
-rw-r--r--  media/libjpeg/simd/x86_64/jidctfst-sse2.asm | 491
-rw-r--r--  media/libjpeg/simd/x86_64/jidctint-avx2.asm | 418
-rw-r--r--  media/libjpeg/simd/x86_64/jidctint-sse2.asm | 847
-rw-r--r--  media/libjpeg/simd/x86_64/jidctred-sse2.asm | 574
-rw-r--r--  media/libjpeg/simd/x86_64/jquantf-sse2.asm | 155
-rw-r--r--  media/libjpeg/simd/x86_64/jquanti-avx2.asm | 163
-rw-r--r--  media/libjpeg/simd/x86_64/jquanti-sse2.asm | 188
-rw-r--r--  media/libjpeg/simd/x86_64/jsimd.c | 1068
-rw-r--r--  media/libjpeg/simd/x86_64/jsimdcpu.asm | 86
-rw-r--r--  media/libogg/CHANGES | 26
-rw-r--r--  media/libogg/README | 97
-rw-r--r--  media/libogg/README.md | 160
-rw-r--r--  media/libogg/README_MOZILLA | 2
-rw-r--r--  media/libogg/include/crctable.h | 278
-rw-r--r--  media/libogg/include/ogg/config_types.h | 13
-rw-r--r--  media/libogg/include/ogg/ogg.h | 1
-rw-r--r--  media/libogg/include/ogg/os_types.h | 51
-rw-r--r--  media/libogg/moz.build | 4
-rw-r--r--  media/libogg/src/ogg_bitwise.c | 5
-rw-r--r--  media/libogg/src/ogg_framing.c | 215
-rwxr-xr-x [-rw-r--r--]  media/libogg/update.sh | 6
-rw-r--r--  media/libvorbis/CHANGES | 185
-rw-r--r--  media/libvorbis/COPYING | 2
-rw-r--r--  media/libvorbis/README | 134
-rw-r--r--  media/libvorbis/README.md | 147
-rw-r--r--  media/libvorbis/README_MOZILLA | 6
-rw-r--r--  media/libvorbis/include/vorbis/codec.h | 3
-rw-r--r--  media/libvorbis/include/vorbis/vorbisenc.h | 3
-rw-r--r--  media/libvorbis/lib/backends.h | 3
-rw-r--r--  media/libvorbis/lib/bitrate.h | 3
-rw-r--r--  media/libvorbis/lib/books/coupled/res_books_51.h | 3
-rw-r--r--  media/libvorbis/lib/books/coupled/res_books_stereo.h | 3
-rw-r--r--  media/libvorbis/lib/books/floor/floor_books.h | 3
-rw-r--r--  media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h | 3
-rw-r--r--  media/libvorbis/lib/codebook.h | 3
-rw-r--r--  media/libvorbis/lib/codec_internal.h | 3
-rw-r--r--  media/libvorbis/lib/envelope.h | 3
-rw-r--r--  media/libvorbis/lib/highlevel.h | 3
-rw-r--r--  media/libvorbis/lib/lookup.h | 3
-rw-r--r--  media/libvorbis/lib/lookup_data.h | 3
-rw-r--r--  media/libvorbis/lib/lpc.h | 3
-rw-r--r--  media/libvorbis/lib/lsp.h | 3
-rw-r--r--  media/libvorbis/lib/masking.h | 3
-rw-r--r--  media/libvorbis/lib/mdct.h | 3
-rw-r--r--  media/libvorbis/lib/misc.h | 3
-rw-r--r--  media/libvorbis/lib/modes/floor_all.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_11.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44p51.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44u.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_11.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_22.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_32.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44p51.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44u.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_X.h | 3
-rw-r--r--  media/libvorbis/lib/os.h | 16
-rw-r--r--  media/libvorbis/lib/psy.h | 3
-rw-r--r--  media/libvorbis/lib/registry.h | 3
-rw-r--r--  media/libvorbis/lib/scales.h | 3
-rw-r--r--  media/libvorbis/lib/smallft.h | 3
-rw-r--r--  media/libvorbis/lib/vorbis_analysis.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_bitrate.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_block.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_codebook.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_envelope.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_floor0.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_floor1.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_info.c | 52
-rw-r--r--  media/libvorbis/lib/vorbis_lookup.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_lpc.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_lsp.c | 6
-rw-r--r--  media/libvorbis/lib/vorbis_mapping0.c | 4
-rw-r--r--  media/libvorbis/lib/vorbis_mdct.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_psy.c | 27
-rw-r--r--  media/libvorbis/lib/vorbis_registry.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_res0.c | 6
-rw-r--r--  media/libvorbis/lib/vorbis_sharedbook.c | 36
-rw-r--r--  media/libvorbis/lib/vorbis_smallft.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_synthesis.c | 5
-rw-r--r--  media/libvorbis/lib/vorbis_window.c | 3
-rw-r--r--  media/libvorbis/lib/vorbisenc.c | 6
-rw-r--r--  media/libvorbis/lib/window.h | 3
-rw-r--r--  media/libvorbis/todo.txt | 22
-rwxr-xr-x [-rw-r--r--]  media/libvorbis/update.sh | 12
-rw-r--r--  media/libwebp/AUTHORS | 10
-rw-r--r--  media/libwebp/NEWS | 55
-rw-r--r--  media/libwebp/README | 32
-rw-r--r--  media/libwebp/README.mux | 16
-rw-r--r--  media/libwebp/UXPCHANGES | 1
-rw-r--r--  media/libwebp/dec/alpha_dec.c | 2
-rw-r--r--  media/libwebp/dec/buffer_dec.c | 10
-rw-r--r--  media/libwebp/dec/frame_dec.c | 4
-rw-r--r--  media/libwebp/dec/idec_dec.c | 11
-rw-r--r--  media/libwebp/dec/io_dec.c | 151
-rw-r--r--  media/libwebp/dec/quant_dec.c | 17
-rw-r--r--  media/libwebp/dec/tree_dec.c | 57
-rw-r--r--  media/libwebp/dec/vp8_dec.c | 97
-rw-r--r--  media/libwebp/dec/vp8i_dec.h | 2
-rw-r--r--  media/libwebp/dec/vp8l_dec.c | 156
-rw-r--r--  media/libwebp/dec/vp8li_dec.h | 20
-rw-r--r--  media/libwebp/dec/webp_dec.c | 17
-rw-r--r--  media/libwebp/dec/webpi_dec.h | 4
-rw-r--r--  media/libwebp/demux/demux.c | 23
-rw-r--r--  media/libwebp/dsp/alpha_processing.c | 73
-rw-r--r--  media/libwebp/dsp/alpha_processing_mips_dsp_r2.c | 228
-rw-r--r--  media/libwebp/dsp/alpha_processing_neon.c | 21
-rw-r--r--  media/libwebp/dsp/alpha_processing_sse2.c | 46
-rw-r--r--  media/libwebp/dsp/alpha_processing_sse41.c | 6
-rw-r--r--  media/libwebp/dsp/cost.c | 411
-rw-r--r--  media/libwebp/dsp/cost_mips32.c | 154
-rw-r--r--  media/libwebp/dsp/cost_mips_dsp_r2.c | 107
-rw-r--r--  media/libwebp/dsp/cost_neon.c | 122
-rw-r--r--  media/libwebp/dsp/cost_sse2.c | 119
-rw-r--r--  media/libwebp/dsp/cpu.c | 253
-rw-r--r--  media/libwebp/dsp/dec.c | 6
-rw-r--r--  media/libwebp/dsp/dec_mips32.c | 587
-rw-r--r--  media/libwebp/dsp/dec_mips_dsp_r2.c | 994
-rw-r--r--  media/libwebp/dsp/dec_msa.c | 1020
-rw-r--r--  media/libwebp/dsp/dec_neon.c | 73
-rw-r--r--  media/libwebp/dsp/dec_sse2.c | 14
-rw-r--r--  media/libwebp/dsp/dsp.h | 117
-rw-r--r--  media/libwebp/dsp/enc.c | 830
-rw-r--r--  media/libwebp/dsp/enc_mips32.c | 677
-rw-r--r--  media/libwebp/dsp/enc_mips_dsp_r2.c | 1517
-rw-r--r--  media/libwebp/dsp/enc_msa.c | 896
-rw-r--r--  media/libwebp/dsp/enc_neon.c | 938
-rw-r--r--  media/libwebp/dsp/enc_sse2.c | 1381
-rw-r--r--  media/libwebp/dsp/enc_sse41.c | 339
-rw-r--r--  media/libwebp/dsp/filters.c | 16
-rw-r--r--  media/libwebp/dsp/filters_mips_dsp_r2.c | 402
-rw-r--r--  media/libwebp/dsp/filters_msa.c | 202
-rw-r--r--  media/libwebp/dsp/filters_sse2.c | 21
-rw-r--r--  media/libwebp/dsp/lossless.c | 111
-rw-r--r--  media/libwebp/dsp/lossless.h | 34
-rw-r--r--  media/libwebp/dsp/lossless_common.h | 15
-rw-r--r--  media/libwebp/dsp/lossless_enc.c | 948
-rw-r--r--  media/libwebp/dsp/lossless_enc_mips32.c | 397
-rw-r--r--  media/libwebp/dsp/lossless_enc_mips_dsp_r2.c | 281
-rw-r--r--  media/libwebp/dsp/lossless_enc_msa.c | 148
-rw-r--r--  media/libwebp/dsp/lossless_enc_neon.c | 144
-rw-r--r--  media/libwebp/dsp/lossless_enc_sse2.c | 669
-rw-r--r--  media/libwebp/dsp/lossless_enc_sse41.c | 155
-rw-r--r--  media/libwebp/dsp/lossless_mips_dsp_r2.c | 701
-rw-r--r--  media/libwebp/dsp/lossless_msa.c | 356
-rw-r--r--  media/libwebp/dsp/lossless_neon.c | 20
-rw-r--r--  media/libwebp/dsp/lossless_sse2.c | 45
-rw-r--r--  media/libwebp/dsp/lossless_sse41.c | 132
-rw-r--r--  media/libwebp/dsp/moz.build | 123
-rw-r--r--  media/libwebp/dsp/msa_macro.h | 5
-rw-r--r--  media/libwebp/dsp/neon.h | 7
-rw-r--r--  media/libwebp/dsp/quant.h | 15
-rw-r--r--  media/libwebp/dsp/rescaler.c | 27
-rw-r--r--  media/libwebp/dsp/rescaler_mips32.c | 295
-rw-r--r--  media/libwebp/dsp/rescaler_mips_dsp_r2.c | 314
-rw-r--r--  media/libwebp/dsp/rescaler_msa.c | 443
-rw-r--r--  media/libwebp/dsp/rescaler_neon.c | 32
-rw-r--r--  media/libwebp/dsp/rescaler_sse2.c | 60
-rw-r--r--  media/libwebp/dsp/ssim.c | 159
-rw-r--r--  media/libwebp/dsp/ssim_sse2.c | 165
-rw-r--r--  media/libwebp/dsp/upsampling.c | 10
-rw-r--r--  media/libwebp/dsp/upsampling_mips_dsp_r2.c | 291
-rw-r--r--  media/libwebp/dsp/upsampling_msa.c | 688
-rw-r--r--  media/libwebp/dsp/upsampling_neon.c | 14
-rw-r--r--  media/libwebp/dsp/yuv.c | 20
-rw-r--r--  media/libwebp/dsp/yuv.h | 2
-rw-r--r--  media/libwebp/dsp/yuv_mips32.c | 103
-rw-r--r--  media/libwebp/dsp/yuv_mips_dsp_r2.c | 134
-rw-r--r--  media/libwebp/enc/alpha_enc.c | 443
-rw-r--r--  media/libwebp/enc/analysis_enc.c | 475
-rw-r--r--  media/libwebp/enc/backward_references_cost_enc.c | 790
-rw-r--r--  media/libwebp/enc/backward_references_enc.c | 1030
-rw-r--r--  media/libwebp/enc/backward_references_enc.h | 20
-rw-r--r--  media/libwebp/enc/config_enc.c | 157
-rw-r--r--  media/libwebp/enc/cost_enc.c | 342
-rw-r--r--  media/libwebp/enc/delta_palettization_enc.h | 25
-rw-r--r--  media/libwebp/enc/filter_enc.c | 235
-rw-r--r--  media/libwebp/enc/frame_enc.c | 899
-rw-r--r--  media/libwebp/enc/histogram_enc.c | 1252
-rw-r--r--  media/libwebp/enc/histogram_enc.h | 6
-rw-r--r--  media/libwebp/enc/iterator_enc.c | 459
-rw-r--r--  media/libwebp/enc/moz.build | 39
-rw-r--r--  media/libwebp/enc/near_lossless_enc.c | 151
-rw-r--r--  media/libwebp/enc/picture_csp_enc.c | 1210
-rw-r--r--  media/libwebp/enc/picture_enc.c | 296
-rw-r--r--  media/libwebp/enc/picture_psnr_enc.c | 258
-rw-r--r--  media/libwebp/enc/picture_rescale_enc.c | 316
-rw-r--r--  media/libwebp/enc/picture_tools_enc.c | 273
-rw-r--r--  media/libwebp/enc/predictor_enc.c | 772
-rw-r--r--  media/libwebp/enc/quant_enc.c | 1388
-rw-r--r--  media/libwebp/enc/syntax_enc.c | 388
-rw-r--r--  media/libwebp/enc/token_enc.c | 262
-rw-r--r--  media/libwebp/enc/tree_enc.c | 504
-rw-r--r--  media/libwebp/enc/vp8i_enc.h | 13
-rw-r--r--  media/libwebp/enc/vp8l_enc.c | 2138
-rw-r--r--  media/libwebp/enc/vp8li_enc.h | 4
-rw-r--r--  media/libwebp/enc/webp_enc.c | 410
-rw-r--r--  media/libwebp/moz.build | 3
-rw-r--r--  media/libwebp/moz/cpu.cpp | 2
-rwxr-xr-x [-rw-r--r--]  media/libwebp/update.sh | 50
-rw-r--r--  media/libwebp/utils/bit_reader_inl_utils.h | 14
-rw-r--r--  media/libwebp/utils/bit_reader_utils.c | 86
-rw-r--r--  media/libwebp/utils/bit_reader_utils.h | 33
-rw-r--r--  media/libwebp/utils/bit_writer_utils.c | 347
-rw-r--r--  media/libwebp/utils/color_cache_utils.c | 22
-rw-r--r--  media/libwebp/utils/color_cache_utils.h | 10
-rw-r--r--  media/libwebp/utils/huffman_encode_utils.c | 416
-rw-r--r--  media/libwebp/utils/huffman_encode_utils.h | 2
-rw-r--r--  media/libwebp/utils/huffman_utils.c | 26
-rw-r--r--  media/libwebp/utils/huffman_utils.h | 2
-rw-r--r--  media/libwebp/utils/moz.build | 2
-rw-r--r--  media/libwebp/utils/quant_levels_dec_utils.c | 2
-rw-r--r--  media/libwebp/utils/rescaler_utils.c | 122
-rw-r--r--  media/libwebp/utils/rescaler_utils.h | 13
-rw-r--r--  media/libwebp/utils/thread_utils.c | 14
-rw-r--r--  media/libwebp/utils/utils.c | 22
-rw-r--r--  media/libwebp/utils/utils.h | 34
-rw-r--r--  media/libwebp/webp/config.h | 37
-rw-r--r--  media/libwebp/webp/decode.h | 9
-rw-r--r--  media/libwebp/webp/encode.h | 21
-rw-r--r--  media/libwebp/webp/mux.h | 12
-rw-r--r--  media/libwebp/webp/mux_types.h | 10
-rw-r--r--  media/libwebp/webp/types.h | 18
-rw-r--r-- [-rwxr-xr-x]  media/update-libjpeg.sh | 6
-rw-r--r--  netwerk/cache2/OldWrappers.cpp | 2
-rw-r--r--  netwerk/cache2/OldWrappers.h | 2
-rw-r--r--  netwerk/dns/nsIDNService.cpp | 5
561 files changed, 109603 insertions, 53344 deletions
diff --git a/dom/plugins/ipc/PluginInstanceParent.cpp b/dom/plugins/ipc/PluginInstanceParent.cpp
index 372a7a238b..ee88819d56 100644
--- a/dom/plugins/ipc/PluginInstanceParent.cpp
+++ b/dom/plugins/ipc/PluginInstanceParent.cpp
@@ -125,7 +125,6 @@ PluginInstanceParent::PluginInstanceParent(PluginModuleParent* parent,
, mNPNIface(npniface)
, mWindowType(NPWindowTypeWindow)
, mDrawingModel(kDefaultDrawingModel)
- , mLastRecordedDrawingModel(-1)
, mFrameID(0)
#if defined(OS_WIN)
, mPluginHWND(nullptr)
diff --git a/dom/plugins/ipc/PluginInstanceParent.h b/dom/plugins/ipc/PluginInstanceParent.h
index cb85378db6..637f82dcc1 100644
--- a/dom/plugins/ipc/PluginInstanceParent.h
+++ b/dom/plugins/ipc/PluginInstanceParent.h
@@ -399,11 +399,6 @@ private:
NPWindowType mWindowType;
int16_t mDrawingModel;
- // Since plugins may request different drawing models to find a compatible
- // one, we only record the drawing model after a SetWindow call and if the
- // drawing model has changed.
- int mLastRecordedDrawingModel;
-
nsDataHashtable<nsPtrHashKey<NPObject>, PluginScriptableObjectParent*> mScriptableObjects;
// This is used to tell the compositor that it should invalidate the ImageLayer.
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl
index 6107737b38..8c7437f82d 100755
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@@ -9,6 +9,10 @@
# read from the Unicode Character Database and compiled into multi-level arrays
# for efficient lookup.
#
+# Note that for most properties, we now rely on ICU; this tool and the tables
+# it generates are used only for a couple of properties not readily exposed
+# via ICU APIs.
+#
# To regenerate the tables in nsUnicodePropertyData.cpp:
#
# (1) Download the current Unicode data files from
@@ -17,13 +21,6 @@
#
# NB: not all the files are actually needed; currently, we require
# - UnicodeData.txt
-# - Scripts.txt
-# - BidiMirroring.txt
-# - BidiBrackets.txt
-# - HangulSyllableType.txt
-# - LineBreak.txt
-# - EastAsianWidth.txt
-# - DerivedCoreProperties.txt
# - ReadMe.txt (to record version/date of the UCD)
# - Unihan_Variants.txt (from Unihan.zip)
# though this may change if we find a need for additional properties.
@@ -32,12 +29,11 @@
#
# We also require the file
# http://www.unicode.org/Public/security/latest/IdentifierStatus.txt
-# http://www.unicode.org/Public/security/latest/IdentifierType.txt
# This file should be in a sub-directory "security" immediately below the
# directory containing the other Unicode data files.
#
-# We also require the latest data file for UTR50, currently revision-16:
-# http://www.unicode.org/Public/vertical/revision-16/VerticalOrientation-16.txt
+# We also require the latest data file for UTR50, currently revision-17:
+# http://www.unicode.org/Public/vertical/revision-17/VerticalOrientation-17.txt
# This file should be in a sub-directory "vertical" immediately below the
# directory containing the other Unicode data files.
#
@@ -45,7 +41,6 @@
# (2) Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
-# /path/to/harfbuzz/src \
# /path/to/icu/common/unicode \
# /path/to/UCD-directory
#
@@ -59,17 +54,15 @@
use strict;
use List::Util qw(first);
-if ($#ARGV != 2) {
+if ($#ARGV != 1) {
print <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
-# /path/to/harfbuzz/src \\
# /path/to/icu/common/unicode \\
# /path/to/UCD-directory
#
-# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
-# icu/common/unicode is the directory containing ICU 'common' public headers,
+# where icu/common/unicode is the directory containing ICU 'common' headers,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/, with additional resources as
@@ -85,35 +78,11 @@ __EOT
exit 0;
}
-my $HARFBUZZ = $ARGV[0];
-my $ICU = $ARGV[1];
-my $UNICODE = $ARGV[2];
-
-# load HB_Category constants
-
-my $cc = -1;
-my %catCode;
-
-sub readHarfBuzzHeader
-{
- my $file = shift;
- open FH, "< $HARFBUZZ/$file" or die "can't open harfbuzz header $HARFBUZZ/$file\n";
- while (<FH>) {
- if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
- $cc++;
- $catCode{$1} = $cc;
- }
- }
- close FH;
-}
+my $ICU = $ARGV[0];
+my $UNICODE = $ARGV[1];
-&readHarfBuzzHeader("hb-unicode.h");
-
-die "didn't find HarfBuzz category codes\n" if $cc == -1;
-
-my %scriptCode;
-my @scriptCodeToTag;
my @scriptCodeToName;
+my @idtype;
my $sc = -1;
@@ -130,8 +99,6 @@ sub readIcuHeader
s/SIGN_WRITING/SIGNWRITING/;
if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
$sc = $2;
- $scriptCode{$1} = $sc;
- $scriptCodeToTag[$sc] = $3;
$scriptCodeToName[$sc] = $1;
}
}
@@ -168,35 +135,7 @@ my %idType = (
# These match the IdentifierType enum in nsUnicodeProperties.h.
my %mappedIdType = (
"Restricted" => 0,
- "Allowed" => 1,
- "Aspirational" => 2 # for Aspirational characters that are not excluded
- # by another attribute.
-);
-
-my %bidicategoryCode = (
- "L" => 0, # Left-to-Right
- "R" => 1, # Right-to-Left
- "EN" => 2, # European Number
- "ES" => 3, # European Number Separator
- "ET" => 4, # European Number Terminator
- "AN" => 5, # Arabic Number
- "CS" => 6, # Common Number Separator
- "B" => 7, # Paragraph Separator
- "S" => 8, # Segment Separator
- "WS" => 9, # Whitespace
- "ON" => 10, # Other Neutrals
- "LRE" => 11, # Left-to-Right Embedding
- "LRO" => 12, # Left-to-Right Override
- "AL" => 13, # Right-to-Left Arabic
- "RLE" => 14, # Right-to-Left Embedding
- "RLO" => 15, # Right-to-Left Override
- "PDF" => 16, # Pop Directional Format
- "NSM" => 17, # Non-Spacing Mark
- "BN" => 18, # Boundary Neutral
- "FSI" => 19, # First Strong Isolate
- "LRI" => 20, # Left-to-Right Isolate
- "RLI" => 21, # Right-to-left Isolate
- "PDI" => 22 # Pop Direcitonal Isolate
+ "Allowed" => 1
);
my %verticalOrientationCode = (
@@ -206,141 +145,18 @@ my %verticalOrientationCode = (
'Tr' => 3 # Tr - Transformed typographically, with fallback to Rotated
);
-my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
- "XX" => 0,
- "AI" => 1,
- "AL" => 2,
- "B2" => 3,
- "BA" => 4,
- "BB" => 5,
- "BK" => 6,
- "CB" => 7,
- "CL" => 8,
- "CM" => 9,
- "CR" => 10,
- "EX" => 11,
- "GL" => 12,
- "HY" => 13,
- "ID" => 14,
- "IN" => 15,
- "IS" => 16,
- "LF" => 17,
- "NS" => 18,
- "NU" => 19,
- "OP" => 20,
- "PO" => 21,
- "PR" => 22,
- "QU" => 23,
- "SA" => 24,
- "SG" => 25,
- "SP" => 26,
- "SY" => 27,
- "ZW" => 28,
- "NL" => 29,
- "WJ" => 30,
- "H2" => 31,
- "H3" => 32,
- "JL" => 33,
- "JT" => 34,
- "JV" => 35,
- "CP" => 36,
- "CJ" => 37,
- "HL" => 38,
- "RI" => 39,
- "EB" => 40,
- "EM" => 41,
- "ZWJ" => 42
-);
-
-my %eastAsianWidthCode = (
- "N" => 0,
- "A" => 1,
- "H" => 2,
- "W" => 3,
- "F" => 4,
- "Na" => 5
-);
-
# initialize default properties
-my @script;
-my @category;
-my @combining;
-my @mirror;
-my @pairedBracketType;
-my @hangul;
-my @casemap;
-my @idtype;
-my @numericvalue;
my @hanVariant;
-my @bidicategory;
my @fullWidth;
my @fullWidthInverse;
my @verticalOrientation;
-my @lineBreak;
-my @eastAsianWidthFWH;
-my @defaultIgnorable;
for (my $i = 0; $i < 0x110000; ++$i) {
- $script[$i] = $scriptCode{"UNKNOWN"};
- $category[$i] = $catCode{"UNASSIGNED"};
- $combining[$i] = 0;
- $pairedBracketType[$i] = 0;
- $casemap[$i] = 0;
- $idtype[$i] = $mappedIdType{'Restricted'};
- $numericvalue[$i] = -1;
$hanVariant[$i] = 0;
- $bidicategory[$i] = $bidicategoryCode{"L"};
$fullWidth[$i] = 0;
$fullWidthInverse[$i] = 0;
$verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
- $lineBreak[$i] = $lineBreakCode{"XX"};
- $eastAsianWidthFWH[$i] = 0;
- $defaultIgnorable[$i] = 0;
}
-# blocks where the default for bidi category is not L
-for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
- $bidicategory[$i] = $bidicategoryCode{"AL"};
-}
-for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
- $bidicategory[$i] = $bidicategoryCode{"R"};
-}
-for my $i (0x20A0..0x20CF) {
- $bidicategory[$i] = $bidicategoryCode{"ET"};
-}
-
-my %ucd2hb = (
-'Cc' => 'CONTROL',
-'Cf' => 'FORMAT',
-'Cn' => 'UNASSIGNED',
-'Co' => 'PRIVATE_USE',
-'Cs' => 'SURROGATE',
-'Ll' => 'LOWERCASE_LETTER',
-'Lm' => 'MODIFIER_LETTER',
-'Lo' => 'OTHER_LETTER',
-'Lt' => 'TITLECASE_LETTER',
-'Lu' => 'UPPERCASE_LETTER',
-'Mc' => 'SPACING_MARK',
-'Me' => 'ENCLOSING_MARK',
-'Mn' => 'NON_SPACING_MARK',
-'Nd' => 'DECIMAL_NUMBER',
-'Nl' => 'LETTER_NUMBER',
-'No' => 'OTHER_NUMBER',
-'Pc' => 'CONNECT_PUNCTUATION',
-'Pd' => 'DASH_PUNCTUATION',
-'Pe' => 'CLOSE_PUNCTUATION',
-'Pf' => 'FINAL_PUNCTUATION',
-'Pi' => 'INITIAL_PUNCTUATION',
-'Po' => 'OTHER_PUNCTUATION',
-'Ps' => 'OPEN_PUNCTUATION',
-'Sc' => 'CURRENCY_SYMBOL',
-'Sk' => 'MODIFIER_SYMBOL',
-'Sm' => 'MATH_SYMBOL',
-'So' => 'OTHER_SYMBOL',
-'Zl' => 'LINE_SEPARATOR',
-'Zp' => 'PARAGRAPH_SEPARATOR',
-'Zs' => 'SPACE_SEPARATOR'
-);
-
# read ReadMe.txt
my @versionInfo;
open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
@@ -350,12 +166,6 @@ while (<FH>) {
}
close FH;
-my $kTitleToUpper = 0x80000000;
-my $kUpperToLower = 0x40000000;
-my $kLowerToTitle = 0x20000000;
-my $kLowerToUpper = 0x10000000;
-my $kCaseMapCharMask = 0x001fffff;
-
# read UnicodeData.txt
open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
while (<FH>) {
@@ -368,12 +178,6 @@ while (<FH>) {
if ($fields[1] =~ /Last/) {
my $last = hex "0x$fields[0]";
do {
- $category[$first] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$first] = $fields[3];
- $bidicategory[$first] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$first] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$first] = 3;
}
@@ -384,33 +188,6 @@ while (<FH>) {
}
} else {
my $usv = hex "0x$fields[0]";
- $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$usv] = $fields[3];
- my $upper = hex $fields[12];
- my $lower = hex $fields[13];
- my $title = hex $fields[14];
- # we only store one mapping for each character,
- # but also record what kind of mapping it is
- if ($upper && $lower) {
- $casemap[$usv] |= $kTitleToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- elsif ($lower) {
- $casemap[$usv] |= $kUpperToLower;
- $casemap[$usv] |= ($usv ^ $lower);
- }
- elsif ($title && ($title != $upper)) {
- $casemap[$usv] |= $kLowerToTitle;
- $casemap[$usv] |= ($usv ^ $title);
- }
- elsif ($upper) {
- $casemap[$usv] |= $kLowerToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$usv] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$usv] = 3;
}
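[Aside: the packed casemap words that the removed code above generated store one mapping per character. The high bits (kTitleToUpper, kUpperToLower, kLowerToTitle, kLowerToUpper) record which kind of mapping is stored, and the low 21 bits hold the code point XORed with its mapped form, so identity mappings pack to zero. A minimal C++ sketch of the decode step, with a hypothetical helper name, mirroring the removed Perl rather than any current Gecko API:

    #include <cstdint>
    // Constants as defined by the removed Perl above.
    const uint32_t kUpperToLower    = 0x40000000;  // flag: word stores an upper->lower mapping
    const uint32_t kCaseMapCharMask = 0x001fffff;  // low 21 bits: usv XOR mapped char
    // Hypothetical decoder: recover the mapped character from the packed word.
    uint32_t MappedChar(uint32_t usv, uint32_t packedCaseMap) {
      // Identity mappings pack to 0, so unmapped characters decode to usv itself.
      return usv ^ (packedCaseMap & kCaseMapCharMask);
    }
]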
@@ -430,176 +207,6 @@ while (<FH>) {
}
close FH;
-# read Scripts.txt
-open FH, "< $UNICODE/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
- my $script = uc($3);
- warn "unknown ICU script $script" unless exists $scriptCode{$script};
- my $script = $scriptCode{$script};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $script[$i] = $script;
- }
- }
-}
-close FH;
-
-# read BidiMirroring.txt
-my @offsets = ();
-push @offsets, 0;
-
-open FH, "< $UNICODE/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
- my $mirrorOffset = hex("0x$2") - hex("0x$1");
- my $offsetIndex = first { $offsets[$_] eq $mirrorOffset } 0..$#offsets;
- if ($offsetIndex == undef) {
- die "too many offset codes\n" if scalar @offsets == 31;
- push @offsets, $mirrorOffset;
- $offsetIndex = $#offsets;
- }
- $mirror[hex "0x$1"] = $offsetIndex;
- }
-}
-close FH;
-
-# read BidiBrackets.txt
-my %pairedBracketTypeCode = (
- 'N' => 0,
- 'O' => 1,
- 'C' => 2
-);
-open FH, "< $UNICODE/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6});\s*(.)/) {
- my $mirroredChar = $offsets[$mirror[hex "0x$1"]] + hex "0x$1";
- die "bidi bracket does not match mirrored char\n" unless $mirroredChar == hex "0x$2";
- my $pbt = uc($3);
- warn "unknown Bidi Bracket type" unless exists $pairedBracketTypeCode{$pbt};
- $pairedBracketType[hex "0x$1"] = $pairedBracketTypeCode{$pbt};
- }
-}
-close FH;
-
-# read HangulSyllableType.txt
-my %hangulType = (
- 'L' => 0x01,
- 'V' => 0x02,
- 'T' => 0x04,
- 'LV' => 0x03,
- 'LVT' => 0x07
-);
-open FH, "< $UNICODE/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $hangul = uc($3);
- warn "unknown Hangul syllable type" unless exists $hangulType{$hangul};
- $hangul = $hangulType{$hangul};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $hangul[$i] = $hangul;
- }
- }
-}
-close FH;
-
-# read LineBreak.txt
-open FH, "< $UNICODE/LineBreak.txt" or die "can't open UCD file LineBreak.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $lb = uc($3);
- warn "unknown LineBreak class" unless exists $lineBreakCode{$lb};
- $lb = $lineBreakCode{$lb};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $lineBreak[$i] = $lb;
- }
- }
-}
-close FH;
-
-# read EastAsianWidth.txt
-open FH, "< $UNICODE/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- my $eaw = $3;
- warn "unknown EastAsianWidth class" unless exists $eastAsianWidthCode{$eaw};
- my $isFWH = ($eaw =~ m/^[FWH]$/) ? 1 : 0;
- for (my $i = $start; $i <= $end; ++$i) {
- $eastAsianWidthFWH[$i] = $isFWH;
- }
- }
-}
-close FH;
-
-# read DerivedCoreProperties.txt (for Default-Ignorables)
-open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
-push @versionInfo, "";
-
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $defaultIgnorable[$i] = 1;
- }
- }
-}
-close FH;
-
# read IdentifierStatus.txt
open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
push @versionInfo, "";
@@ -623,33 +230,6 @@ while (<FH>) {
}
close FH;
-# read IdentifierType.txt, to find Aspirational characters
-open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- s/\xef\xbb\xbf//;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) {
- my $idtype = $3;
- foreach (split(/ /, $idtype)) {
- warn "unknown Identifier Type $_" unless exists $idType{$_};
- }
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) {
-
- for (my $i = $start; $i <= $end; ++$i) {
- $idtype[$i] = $mappedIdType{'Aspirational'};
- }
- }
- }
-}
-close FH;
-
open FH, "< $UNICODE/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
push @versionInfo, "";
while (<FH>) {
@@ -686,8 +266,8 @@ while (<FH>) {
}
close FH;
-# read VerticalOrientation-16.txt
-open FH, "< $UNICODE/vertical/VerticalOrientation-16.txt" or die "can't open UTR50 data file VerticalOrientation-16.txt\n";
+# read VerticalOrientation-17.txt
+open FH, "< $UNICODE/vertical/VerticalOrientation-17.txt" or die "can't open UTR50 data file VerticalOrientation-17.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@@ -785,8 +365,7 @@ struct nsCharProps2 {
unsigned char mIdType:2;
};
|;
-&genTables("", "",
- "CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
+&genTables("CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
print HEADER "#pragma pack()\n\n";
@@ -802,42 +381,32 @@ sub sprintHanVariants
return sprintf("0x%02x,", $val);
}
## Han Variant data currently unused but may be needed in future, see bug 857481
-## &genTables("", "", "HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
+## &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
sub sprintFullWidth
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidth[$usv]);
}
-&genTables("", "", "FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
+&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
sub sprintFullWidthInverse
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidthInverse[$usv]);
}
-&genTables("", "", "FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
+&genTables("FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
print STDERR "Total data = $totalData\n";
-printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
-printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
-printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
-printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
-printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
-
sub genTables
{
- my ($guardBegin, $guardEnd,
- $prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
+ my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
if ($typedef ne '') {
- print HEADER "$guardBegin\n";
print HEADER "$typedef\n";
- print HEADER "$guardEnd\n\n";
}
- print DATA_TABLES "\n$guardBegin\n";
print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
print DATA_TABLES "#define k${prefix}CharBits $charBits\n";
@@ -906,7 +475,6 @@ sub genTables
print DATA_TABLES $i < $#char ? "},\n" : "}\n";
}
print DATA_TABLES "};\n";
- print DATA_TABLES "$guardEnd\n";
my $dataSize = $pmCount * $indexLen * $pmBits/8 +
$chCount * $pageLen * $bytesPerEntry +
@@ -926,7 +494,7 @@ close DATA_TABLES;
print HEADER "namespace mozilla {\n";
print HEADER "namespace unicode {\n";
-print HEADER "enum class Script {\n";
+print HEADER "enum class Script : int16_t {\n";
for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
print HEADER " ", $scriptCodeToName[$i], " = ", $i, ",\n";
}
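[Aside on the generated table layout: genTables() with indexBits = 9 and charBits = 7 (the CharProp2 case) emits a small trie over code points: a 16-entry plane map, 512-entry page indexes, and 128-entry value pages. A hedged C++ sketch of the lookup such tables support; the function name and the sCharProp2Values table are assumptions inferred from the generated arrays shown later in this diff, and the real accessor lives in nsUnicodeProperties:

    // Assumes the kCharProp2* constants and the sCharProp2Planes/Pages/Values
    // arrays emitted by this script; the return type is the nsCharProps2
    // struct declared above.
    nsCharProps2 GetCharProps2Sketch(uint32_t ch) {
      // BMP code points use page group 0; supplementary planes 1..16 are
      // first mapped through the 16-entry plane table.
      unsigned group = (ch < 0x10000) ? 0 : sCharProp2Planes[(ch >> 16) - 1];
      // The top 9 of the low 16 bits select a page...
      unsigned page = sCharProp2Pages[group][(ch & 0xffff) >> kCharProp2CharBits];
      // ...and the low 7 bits select the entry within that 128-entry page.
      return sCharProp2Values[page][ch & ((1 << kCharProp2CharBits) - 1)];
    }
]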
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
index 2ff69d19a5..d3c4717b9a 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -44,7 +44,6 @@ enum PairedBracketType {
enum IdentifierType {
IDTYPE_RESTRICTED = 0,
IDTYPE_ALLOWED = 1,
- IDTYPE_ASPIRATIONAL = 2,
};
enum EmojiPresentation {
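[With IDTYPE_ASPIRATIONAL removed, identifier-safety checks reduce to a binary test against the two remaining values. A hedged sketch of such a caller, reusing the lookup sketched above; IsAllowedIdentifierChar is a hypothetical name, and the actual consumer is nsIDNService.cpp, also touched in this commit:

    // Hypothetical helper: only Restricted (0) and Allowed (1) remain.
    bool IsAllowedIdentifierChar(uint32_t ch) {
      return GetCharProps2Sketch(ch).mIdType == IDTYPE_ALLOWED;
    }
]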
diff --git a/intl/unicharutil/util/nsUnicodePropertyData.cpp b/intl/unicharutil/util/nsUnicodePropertyData.cpp
index fc730ac5b5..dccf14bcd2 100644
--- a/intl/unicharutil/util/nsUnicodePropertyData.cpp
+++ b/intl/unicharutil/util/nsUnicodePropertyData.cpp
@@ -11,13 +11,12 @@
*/
/*
- * Created on Wed Jun 22 10:09:48 2022 from UCD data files with version info:
+ * Created on Thu Jun 23 07:44:34 2022 from UCD data files with version info:
*
# Unicode Character Database
-# Date: 2016-06-20, 14:59:00 GMT [KW]
-# © 2016 Unicode®, Inc.
-# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# Date: 2017-06-18, 23:32:00 GMT [KW]
+# © 2017 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For documentation, see the following:
@@ -25,44 +24,20 @@
# UAX #38, "Unicode Han Database (Unihan)"
# UAX #44, "Unicode Character Database."
#
-# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/
+# The UAXes can be accessed at http://www.unicode.org/versions/Unicode10.0.0/
This directory contains the final data files
-for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
-
-# Scripts-9.0.0.txt
-# Date: 2016-06-01, 10:34:37 GMT
-
-# BidiMirroring-9.0.0.txt
-# Date: 2016-01-21, 22:00:00 GMT [KW, LI]
-
-# BidiBrackets-9.0.0.txt
-# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]
-
-# HangulSyllableType-9.0.0.txt
-# Date: 2016-03-02, 18:55:01 GMT
-
-# LineBreak-9.0.0.txt
-# Date: 2016-05-26, 01:00:00 GMT [KW, LI]
-
-# EastAsianWidth-9.0.0.txt
-# Date: 2016-05-27, 17:00:00 GMT [KW, LI]
-
-# DerivedCoreProperties-9.0.0.txt
-# Date: 2016-06-01, 10:34:24 GMT
+for the Unicode Character Database, for Version 10.0.0 of the Unicode Standard.
# IdentifierStatus.txt
-# Date: 2016-06-16, 13:41:30 GMT
-
-# IdentifierType.txt
-# Date: 2016-06-16, 13:41:30 GMT
+# Date: 2017-04-08, 16:13:41 GMT
#
# Unihan_Variants.txt
-# Date: 2016-06-01 07:01:48 GMT [JHJ]
+# Date: 2017-05-14 07:01:48 GMT [JHJ]
-# VerticalOrientation-16.txt
-# Date: 2016-07-23, 01:00:00 GMT [EM, KI, LI]
+# VerticalOrientation-17.txt
+# Date: 2016-10-20, 07:00:00 GMT [EM, KI, LI]
*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@@ -71,22 +46,20 @@ for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
#include <stdint.h>
#include "harfbuzz/hb.h"
-
-
#define kCharProp2MaxPlane 16
#define kCharProp2IndexBits 9
#define kCharProp2CharBits 7
static const uint8_t sCharProp2Planes[16] = {1,2,3,4,4,4,4,4,4,4,4,4,4,4,3,3};
static const uint8_t sCharProp2Pages[5][512] = {
- {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,34,35,36,37,38,39,40,40,40,41,16,16,42,43,44,16,16,16,16,16,16,16,45,16,16,46,47,48,49,50,51,52,53,54,16,55,56,57,34,16,58,59,34,60,61,16,16,16,16,16,16,62,63,16,16,64,65,16,34,34,34,66,67,68,69,34,34,70,34,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,72,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,73,40,40,40,40,40,40,40,40,40,74,16,16,75,76,77,78,16,16,79,80,81,16,82,16,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,83,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,84,34,16,16,16,16,16,16,85,16,86,87},
- {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,88,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,89,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,90,91,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,16,16,34,16,16,16,16,16,16,16,16,16,34,34,34,34,34,89,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,92,34,34,34,34,34,34,34,34,34,34,34,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16},
- {71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,93,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,94,71,95,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,96,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,97},
- {34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,97},
+ {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,34,35,36,37,38,39,34,34,34,34,16,16,40,16,41,16,16,16,16,16,16,16,42,16,16,43,44,45,46,47,48,49,50,51,16,52,53,54,34,16,55,56,34,57,58,16,16,16,16,16,16,59,60,16,16,61,62,16,34,34,34,63,64,65,66,34,34,67,34,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,69,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,70,34,34,34,34,34,34,34,34,34,71,16,16,72,73,74,75,16,16,76,77,78,16,79,16,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,80,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,81,34,16,16,16,16,16,16,82,16,83,84},
+ {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,85,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,76,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,87,68,88,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,16,16,34,16,16,16,16,16,16,16,16,16,34,34,34,34,34,86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,89,34,34,34,34,34,34,34,34,34,34,34,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16},
+ {68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,90,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,91,68,92,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,93,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,94,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,95},
+ {34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,95},
{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}
};
-static const nsCharProps2 sCharProp2Values[98][128] = {
+static const nsCharProps2 sCharProp2Values[96][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{1,0},{0,0},{1,0},{1,0},{1,0},{1,0},{0,0},{1,0},{1,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{0,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{0,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0}},
@@ -106,14 +79,14 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
- {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
+ {{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -126,12 +99,9 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
+ {{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -151,17 +121,17 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2}},
+ {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{0,0},{2,0},{2,0},{0,0},{0,0},{0,1},{0,1},{0,1},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{0,0},{0,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{3,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{2,1},{0,0},{0,0},{0,1},{0,1},{2,0},{2,0},{0,1},{0,1},{0,0},{3,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{3,1},{0,1},{0,1},{0,0}},
- {{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0}},
{{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,0},{2,0},{2,0},{2,0},{2,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -177,18 +147,16 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{3,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{0,0},{0,0},{1,0},{1,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
+ {{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{2,0},{2,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
- {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0}}
};
-
-
-
#define kFullWidthMaxPlane 0
#define kFullWidthIndexBits 10
#define kFullWidthCharBits 6
@@ -207,9 +175,6 @@ static const uint16_t sFullWidthValues[9][64] = {
{0x30bf,0x30c1,0x30c4,0x30c6,0x30c8,0x30ca,0x30cb,0x30cc,0x30cd,0x30ce,0x30cf,0x30d2,0x30d5,0x30d8,0x30db,0x30de,0x30df,0x30e0,0x30e1,0x30e2,0x30e4,0x30e6,0x30e8,0x30e9,0x30ea,0x30eb,0x30ec,0x30ed,0x30ef,0x30f3,0x3099,0x309a,0x3164,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,0x3138,0x3139,0x313a,0x313b,0x313c,0x313d,0x313e,0x313f,0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,0x3148,0x3149,0x314a,0x314b,0x314c,0x314d,0x314e,0x0000},
{0x0000,0x0000,0x314f,0x3150,0x3151,0x3152,0x3153,0x3154,0x0000,0x0000,0x3155,0x3156,0x3157,0x3158,0x3159,0x315a,0x0000,0x0000,0x315b,0x315c,0x315d,0x315e,0x315f,0x3160,0x0000,0x0000,0x3161,0x3162,0x3163,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2502,0x2190,0x2191,0x2192,0x2193,0x25a0,0x25cb,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000}
};
-
-
-
#define kFullWidthInverseMaxPlane 0
#define kFullWidthInverseIndexBits 10
#define kFullWidthInverseCharBits 6
@@ -232,13 +197,6 @@ static const uint16_t sFullWidthInverseValues[13][64] = {
{0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2985,0x2986,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000},
{0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x00a2,0x00a3,0x00ac,0x00af,0x00a6,0x00a5,0x20a9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000}
};
-
-const uint32_t kTitleToUpper = 0x80000000;
-const uint32_t kUpperToLower = 0x40000000;
-const uint32_t kLowerToTitle = 0x20000000;
-const uint32_t kLowerToUpper = 0x10000000;
-const uint32_t kCaseMapCharMask = 0x001fffff;
-
/*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/
diff --git a/intl/unicharutil/util/nsUnicodeScriptCodes.h b/intl/unicharutil/util/nsUnicodeScriptCodes.h
index 8cbc2b6a4a..b69148386e 100644
--- a/intl/unicharutil/util/nsUnicodeScriptCodes.h
+++ b/intl/unicharutil/util/nsUnicodeScriptCodes.h
@@ -11,13 +11,12 @@
*/
/*
- * Created on Wed Jun 22 10:09:48 2022 from UCD data files with version info:
+ * Created on Thu Jun 23 07:44:34 2022 from UCD data files with version info:
*
# Unicode Character Database
-# Date: 2016-06-20, 14:59:00 GMT [KW]
-# © 2016 Unicode®, Inc.
-# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# Date: 2017-06-18, 23:32:00 GMT [KW]
+# © 2017 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For documentation, see the following:
@@ -25,44 +24,20 @@
# UAX #38, "Unicode Han Database (Unihan)"
# UAX #44, "Unicode Character Database."
#
-# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/
+# The UAXes can be accessed at http://www.unicode.org/versions/Unicode10.0.0/
This directory contains the final data files
-for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
-
-# Scripts-9.0.0.txt
-# Date: 2016-06-01, 10:34:37 GMT
-
-# BidiMirroring-9.0.0.txt
-# Date: 2016-01-21, 22:00:00 GMT [KW, LI]
-
-# BidiBrackets-9.0.0.txt
-# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]
-
-# HangulSyllableType-9.0.0.txt
-# Date: 2016-03-02, 18:55:01 GMT
-
-# LineBreak-9.0.0.txt
-# Date: 2016-05-26, 01:00:00 GMT [KW, LI]
-
-# EastAsianWidth-9.0.0.txt
-# Date: 2016-05-27, 17:00:00 GMT [KW, LI]
-
-# DerivedCoreProperties-9.0.0.txt
-# Date: 2016-06-01, 10:34:24 GMT
+for the Unicode Character Database, for Version 10.0.0 of the Unicode Standard.
# IdentifierStatus.txt
-# Date: 2016-06-16, 13:41:30 GMT
-
-# IdentifierType.txt
-# Date: 2016-06-16, 13:41:30 GMT
+# Date: 2017-04-08, 16:13:41 GMT
#
# Unihan_Variants.txt
-# Date: 2016-06-01 07:01:48 GMT [JHJ]
+# Date: 2017-05-14 07:01:48 GMT [JHJ]
-# VerticalOrientation-16.txt
-# Date: 2016-07-23, 01:00:00 GMT [EM, KI, LI]
+# VerticalOrientation-17.txt
+# Date: 2016-10-20, 07:00:00 GMT [EM, KI, LI]
*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@@ -74,7 +49,6 @@ for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
#pragma pack(1)
-
struct nsCharProps2 {
// Currently only 4 bits are defined here, so 4 more could be added without
// affecting the storage requirements for this struct. Or we could pack two
@@ -83,13 +57,11 @@ struct nsCharProps2 {
unsigned char mIdType:2;
};
-
-
#pragma pack()
namespace mozilla {
namespace unicode {
-enum class Script {
+enum class Script : int16_t {
COMMON = 0,
INHERITED = 1,
ARABIC = 2,
diff --git a/ipc/chromium/src/base/condition_variable_posix.cc b/ipc/chromium/src/base/condition_variable_posix.cc
index 58565541e8..4a8024c2bc 100644
--- a/ipc/chromium/src/base/condition_variable_posix.cc
+++ b/ipc/chromium/src/base/condition_variable_posix.cc
@@ -19,8 +19,7 @@ using base::TimeDelta;
ConditionVariable::ConditionVariable(Lock* user_lock)
: user_mutex_(user_lock->lock_impl()->os_lock()) {
int rv = 0;
-#if !defined(OS_MACOSX) && \
- defined(HAVE_PTHREAD_COND_TIMEDWAIT_MONOTONIC)
+#if !defined(OS_MACOSX)
pthread_condattr_t attrs;
rv = pthread_condattr_init(&attrs);
DCHECK_EQ(0, rv);
diff --git a/media/libjpeg/1050342.diff b/media/libjpeg/1050342.diff
deleted file mode 100644
index a409ca2c47..0000000000
--- a/media/libjpeg/1050342.diff
+++ /dev/null
@@ -1,121 +0,0 @@
-Bug 1050342. Fix a case where the fast huffman decoder in libjpeg-turbo can produce different results depending on how data is fed to it.
-
-This change comes from the blink repo https://codereview.appspot.com/229430043/ and is unlikely to be accepted upstream into libjpeg-turbo.
-
-diff --git jdhuff.c jdhuff.c
---- jdhuff.c
-+++ jdhuff.c
-@@ -664,17 +664,17 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- ASSIGN_STATE(state, entropy->saved);
-
- for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
- d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
- d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
- register int s, k, r, l;
-
-- HUFF_DECODE_FAST(s, l, dctbl);
-+ HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
- if (s) {
- FILL_BIT_BUFFER_FAST
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
- }
-
- if (entropy->dc_needed[blkn]) {
- int ci = cinfo->MCU_membership[blkn];
-@@ -682,17 +682,17 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- state.last_dc_val[ci] = s;
- if (block)
- (*block)[0] = (JCOEF) s;
- }
-
- if (entropy->ac_needed[blkn] && block) {
-
- for (k = 1; k < DCTSIZE2; k++) {
-- HUFF_DECODE_FAST(s, l, actbl);
-+ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- FILL_BIT_BUFFER_FAST
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
-@@ -701,33 +701,34 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- if (r != 15) break;
- k += 15;
- }
- }
-
- } else {
-
- for (k = 1; k < DCTSIZE2; k++) {
-- HUFF_DECODE_FAST(s, l, actbl);
-+ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- FILL_BIT_BUFFER_FAST
- DROP_BITS(s);
- } else {
- if (r != 15) break;
- k += 15;
- }
- }
- }
- }
-
- if (cinfo->unread_marker != 0) {
-+slow_decode_mcu:
- cinfo->unread_marker = 0;
- return FALSE;
- }
-
- br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
- br_state.next_input_byte = buffer;
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
-diff --git jdhuff.h jdhuff.h
---- jdhuff.h
-+++ jdhuff.h
-@@ -203,32 +203,34 @@ EXTERN(boolean) jpeg_fill_bit_buffer
- } else { \
- slowlabel: \
- if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
- { failaction; } \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
- } \
- }
-
--#define HUFF_DECODE_FAST(s,nb,htbl) \
-+#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
- FILL_BIT_BUFFER_FAST; \
- s = PEEK_BITS(HUFF_LOOKAHEAD); \
- s = htbl->lookup[s]; \
- nb = s >> HUFF_LOOKAHEAD; \
- /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \
- DROP_BITS(nb); \
- s = s & ((1 << HUFF_LOOKAHEAD) - 1); \
- if (nb > HUFF_LOOKAHEAD) { \
- /* Equivalent of jpeg_huff_decode() */ \
- /* Don't use GET_BITS() here because we don't want to modify bits_left */ \
- s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \
- while (s > htbl->maxcode[nb]) { \
- s <<= 1; \
- s |= GET_BITS(1); \
- nb++; \
- } \
-- s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \
-+ if (nb > 16) \
-+ goto slowlabel; \
-+ s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
- }
-
- /* Out-of-line case for Huffman code fetching */
- EXTERN(int) jpeg_huff_decode
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, d_derived_tbl *htbl, int min_bits);
diff --git a/media/libjpeg/ChangeLog.md b/media/libjpeg/ChangeLog.md
new file mode 100644
index 0000000000..e6700c3c27
--- /dev/null
+++ b/media/libjpeg/ChangeLog.md
@@ -0,0 +1,1839 @@
+2.1.3
+=====
+
+### Significant changes relative to 2.1.2
+
+1. Fixed a regression introduced by 2.0 beta1[7] whereby cjpeg compressed PGM
+input files into full-color JPEG images unless the `-grayscale` option was
+used.
+
+2. cjpeg now automatically compresses GIF and 8-bit BMP input files into
+grayscale JPEG images if the input files contain only shades of gray.
+
+3. The build system now enables the intrinsics implementation of the AArch64
+(Arm 64-bit) Neon SIMD extensions by default when using GCC 12 or later.
+
+4. Fixed a segfault that occurred while decompressing a 4:2:0 JPEG image using
+the merged (non-fancy) upsampling algorithms (that is, with
+`cinfo.do_fancy_upsampling` set to `FALSE`) along with `jpeg_crop_scanline()`.
+Specifically, the segfault occurred if the number of bytes remaining in the
+output buffer was less than the number of bytes required to represent one
+uncropped scanline of the output image. For that reason, the issue could only
+be reproduced using the libjpeg API, not using djpeg.
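+
+ A minimal sketch of the libjpeg API combination involved (not code from
+this repository; "in.jpg" is a placeholder and error handling is omitted):
+
+    #include <stdio.h>
+    #include "jpeglib.h"
+
+    void decode_cropped(void)
+    {
+      struct jpeg_decompress_struct cinfo;
+      struct jpeg_error_mgr jerr;
+      FILE *infile = fopen("in.jpg", "rb");
+
+      cinfo.err = jpeg_std_error(&jerr);
+      jpeg_create_decompress(&cinfo);
+      jpeg_stdio_src(&cinfo, infile);
+      jpeg_read_header(&cinfo, TRUE);
+
+      cinfo.do_fancy_upsampling = FALSE;  /* merged (non-fancy) upsampling */
+      jpeg_start_decompress(&cinfo);
+
+      JDIMENSION xoff = 16, width = 64;
+      jpeg_crop_scanline(&cinfo, &xoff, &width);  /* may widen the region */
+
+      /* Each row buffer must hold the full (possibly widened) cropped
+         width, i.e. cinfo.output_width * cinfo.output_components bytes. */
+      JSAMPARRAY buf = (*cinfo.mem->alloc_sarray)(
+          (j_common_ptr)&cinfo, JPOOL_IMAGE,
+          cinfo.output_width * cinfo.output_components, 1);
+      while (cinfo.output_scanline < cinfo.output_height)
+        jpeg_read_scanlines(&cinfo, buf, 1);
+
+      jpeg_finish_decompress(&cinfo);
+      jpeg_destroy_decompress(&cinfo);
+      fclose(infile);
+    }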
+
+
+2.1.2
+=====
+
+### Significant changes relative to 2.1.1
+
+1. Fixed a regression introduced by 2.1 beta1[13] that caused the remaining
+GAS implementations of AArch64 (Arm 64-bit) Neon SIMD functions (which are used
+by default with GCC for performance reasons) to be placed in the `.rodata`
+section rather than in the `.text` section. This caused the GNU linker to
+automatically place the `.rodata` section in an executable segment, which
+prevented libjpeg-turbo from working properly with other linkers and also
+represented a potential security risk.
+
+2. Fixed an issue whereby the `tjTransform()` function incorrectly computed the
+MCU block size for 4:4:4 JPEG images with non-unary sampling factors and thus
+unduly rejected some cropping regions, even though those regions aligned with
+8x8 MCU block boundaries.
+
+3. Fixed a regression introduced by 2.1 beta1[13] that caused the build system
+to enable the Arm Neon SIMD extensions when targeting Armv6 and other legacy
+architectures that do not support Neon instructions.
+
+4. libjpeg-turbo now performs run-time detection of AltiVec instructions on
+FreeBSD/PowerPC systems if AltiVec instructions are not enabled at compile
+time. This allows both AltiVec-equipped and non-AltiVec-equipped CPUs to be
+supported using the same build of libjpeg-turbo.
+
+5. cjpeg now accepts a `-strict` argument similar to that of djpeg and
+jpegtran, which causes the compressor to abort if an LZW-compressed GIF input
+image contains incomplete or corrupt image data.
+
+
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+4. Added a command-line argument (`-copy icc`) to jpegtran that causes it to
+copy only the ICC profile markers from the source file and discard any other
+metadata.
+
+5. libjpeg-turbo should now build and run on CHERI-enabled architectures, which
+use capability pointers that are larger than the size of `size_t`.
+
+6. Fixed a regression (CVE-2021-37972) introduced by 2.1 beta1[5] that caused a
+segfault in the 64-bit SSE2 Huffman encoder when attempting to losslessly
+transform a specially-crafted malformed JPEG image.
+
+
+2.1.0
+=====
+
+### Significant changes relative to 2.1 beta1
+
+1. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress certain progressive JPEG images with one or more component planes of
+width 8 or less caused a buffer overrun.
+
+2. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress a specially-crafted malformed progressive JPEG image caused the
+block smoothing algorithm to read from uninitialized memory.
+
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
+4. Fixed a floating point exception (CVE-2021-20205) that occurred when
+attempting to compress a specially-crafted malformed GIF image with a specified
+image width of 0 using cjpeg.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 32 and non-zero
+successive approximation low bit positions would, under certain circumstances,
+result in an error ("Missing Huffman code table entry") and an invalid JPEG
+image.
+
+6. Introduced a new flag (`TJFLAG_LIMITSCANS` in the TurboJPEG C API and
+`TJ.FLAG_LIMIT_SCANS` in the TurboJPEG Java API) and a corresponding TJBench
+command-line argument (`-limitscans`) that causes the TurboJPEG decompression
+and transform functions/operations to return/throw an error if a progressive
+JPEG image contains an unreasonably large number of scans. This allows
+applications that use the TurboJPEG API to guard against an exploit of the
+progressive JPEG format described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+7. The PPM reader now throws an error, rather than segfaulting (due to a buffer
+overrun) or generating incorrect pixels, if an application attempts to use the
+`tjLoadImage()` function to load a 16-bit binary PPM file (a binary PPM file
+with a maximum value greater than 255) into a grayscale image buffer or to load
+a 16-bit binary PGM file into an RGB image buffer.
+
+8. Fixed an issue in the PPM reader that caused incorrect pixels to be
+generated when using the `tjLoadImage()` function to load a 16-bit binary PPM
+file into an extended RGB image buffer.
+
+9. Fixed an issue whereby, if a JPEG buffer was automatically re-allocated by
+one of the TurboJPEG compression or transform functions and an error
+subsequently occurred during compression or transformation, the JPEG buffer
+pointer passed by the application was not updated when the function returned.
+
+
+2.0.90 (2.1 beta1)
+==================
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers. The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+ Caveats:
+ - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+ - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms. Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+ - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+ - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+ - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+ - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+ - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+ The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds. 32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds. 32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. The block smoothing algorithm that is applied by default when decompressing
+progressive Huffman-encoded JPEG images has been improved in the following
+ways:
+
+ - The algorithm is now more fault-tolerant. Previously, if a particular
+scan was incomplete, then the smoothing parameters for the incomplete scan
+would be applied to the entire output image, including the parts of the image
+that were generated by the prior (complete) scan. Visually, this had the
+effect of removing block smoothing from lower-frequency scans if they were
+followed by an incomplete higher-frequency scan. libjpeg-turbo now applies
+block smoothing parameters to each iMCU row based on which scan generated the
+pixels in that row, rather than always using the block smoothing parameters for
+the most recent scan.
+ - When applying block smoothing to DC scans, a Gaussian-like kernel with a
+5x5 window is used to reduce the "blocky" appearance.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+ - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+ - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries. This facilitates using libjpeg-turbo with CMake's
+`find_package()` function. For example:
+
+ find_package(libjpeg-turbo CONFIG REQUIRED)
+
+ add_executable(libjpeg_program libjpeg_program.c)
+ target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+ add_executable(libjpeg_program_static libjpeg_program.c)
+ target_link_libraries(libjpeg_program_static PUBLIC
+ libjpeg-turbo::jpeg-static)
+
+ add_executable(turbojpeg_program turbojpeg_program.c)
+ target_link_libraries(turbojpeg_program PUBLIC
+ libjpeg-turbo::turbojpeg)
+
+ add_executable(turbojpeg_program_static turbojpeg_program.c)
+ target_link_libraries(turbojpeg_program_static PUBLIC
+ libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option. Refer to jpegtran.1 or usage.txt for more details.
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms. This new implementation is
+significantly faster in some cases than the old GAS implementation--
+depending on the algorithms used, the type of CPU core, and the compiler. GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+ - 32-bit RGB-to-YCbCr color conversion
+ - 32-bit fast and accurate inverse DCT
+ - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+ - 64-bit accurate forward and inverse DCT
+ - 64-bit Huffman encoding
+
+ A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
+
+ Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
+libjpeg-turbo SDK package for both iOS and macOS.
+
+
+2.0.6
+=====
+
+### Significant changes relative to 2.0.5:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when using any of the YUV encoding/compression/decompression/decoding
+methods in the TurboJPEG Java API.
+
+2. Fixed or worked around multiple issues with `jpeg_skip_scanlines()` (a
+usage sketch follows this list):
+
+ - Fixed segfaults or "Corrupt JPEG data: premature end of data segment"
+errors in `jpeg_skip_scanlines()` that occurred when decompressing 4:2:2 or
+4:2:0 JPEG images using merged (non-fancy) upsampling/color conversion (that
+is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.) 2.0.0[6] was a
+similar fix, but it did not cover all cases.
+ - `jpeg_skip_scanlines()` now throws an error if two-pass color
+quantization is enabled. Two-pass color quantization never worked properly
+with `jpeg_skip_scanlines()`, and the issues could not readily be fixed.
+ - Fixed an issue whereby `jpeg_skip_scanlines()` always returned 0 when
+skipping past the end of an image.
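+
+ A sketch of the basic `jpeg_skip_scanlines()` pattern (assumes a started
+decompress object `cinfo` and a row pointer `row` of type `JSAMPROW`; error
+handling omitted):
+
+    /* Skip the top 100 rows, then decode the rest normally.  The return
+       value is the number of rows actually skipped, which may differ from
+       the request. */
+    JDIMENSION skipped = jpeg_skip_scanlines(&cinfo, 100);
+    (void)skipped;
+
+    while (cinfo.output_scanline < cinfo.output_height)
+      jpeg_read_scanlines(&cinfo, &row, 1);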
+
+3. The Arm 64-bit (Armv8) Neon SIMD extensions can now be built using MinGW
+toolchains targeting Arm64 (AArch64) Windows binaries.
+
+4. Fixed unexpected visual artifacts that occurred when using
+`jpeg_crop_scanline()` and interblock smoothing while decompressing only the DC
+scan of a progressive JPEG image.
+
+5. Fixed an issue whereby libjpeg-turbo would not build if 12-bit-per-component
+JPEG support (`WITH_12BIT`) was enabled along with libjpeg v7 or libjpeg v8
+API/ABI emulation (`WITH_JPEG7` or `WITH_JPEG8`.)
+
+
+2.0.5
+=====
+
+### Significant changes relative to 2.0.4:
+
+1. Worked around issues in the MIPS DSPr2 SIMD extensions that caused failures
+in the libjpeg-turbo regression tests. Specifically, the
+`jsimd_h2v1_downsample_dspr2()` and `jsimd_h2v2_downsample_dspr2()` functions
+in the MIPS DSPr2 SIMD extensions are now disabled until/unless they can be
+fixed, and other functions that are incompatible with big endian MIPS CPUs are
+disabled when building libjpeg-turbo for such CPUs.
+
+2. Fixed an oversight in the `TJCompressor.compress(int)` method in the
+TurboJPEG Java API that caused an error ("java.lang.IllegalStateException: No
+source image is associated with this instance") when attempting to use that
+method to compress a YUV image.
+
+3. Fixed an issue (CVE-2020-13790) in the PPM reader that caused a buffer
+overrun in cjpeg, TJBench, or the `tjLoadImage()` function if one of the values
+in a binary PPM/PGM input file exceeded the maximum value defined in the file's
+header and that maximum value was less than 255. libjpeg-turbo 1.5.0 already
+included a similar fix for binary PPM/PGM files with maximum values greater
+than 255.
+
+4. The TurboJPEG API library's global error handler, which is used in functions
+such as `tjBufSize()` and `tjLoadImage()` that do not require a TurboJPEG
+instance handle, is now thread-safe on platforms that support thread-local
+storage.
+
+
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images. This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
+
+6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down. This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS. This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors. This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
+2.0.1
+=====
+
+### Significant changes relative to 2.0.0:
+
+1. Fixed a regression introduced with the new CMake-based Un*x build system,
+whereby jconfig.h could cause compiler warnings of the form
+`"HAVE_*_H" redefined` if it was included by downstream Autotools-based
+projects that used `AC_CHECK_HEADERS()` to check for the existence of locale.h,
+stddef.h, or stdlib.h.
+
+2. The `jsimd_quantize_float_dspr2()` and `jsimd_convsamp_float_dspr2()`
+functions in the MIPS DSPr2 SIMD extensions are now disabled at compile time
+if the soft float ABI is enabled. Those functions use instructions that are
+incompatible with the soft float ABI.
+
+3. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that caused libjpeg-turbo to crash on
+Windows 7 if Service Pack 1 was not installed.
+
+4. Fixed an out-of-bounds read in cjpeg that occurred when attempting to
+compress
+a specially-crafted malformed color-index (8-bit-per-sample) Targa file in
+which some of the samples (color indices) exceeded the bounds of the Targa
+file's color table.
+
+5. Fixed an issue whereby installing a fully static build of libjpeg-turbo
+(a build in which `CFLAGS` contains `-static` and `ENABLE_SHARED` is `0`) would
+fail with "No valid ELF RPATH or RUNPATH entry exists in the file."
+
+
+2.0.0
+=====
+
+### Significant changes relative to 2.0 beta1:
+
+1. The TurboJPEG API can now decompress CMYK JPEG images that have subsampled M
+and Y components (not to be confused with YCCK JPEG images, in which the C/M/Y
+components have been transformed into luma and chroma.) Previously, an error
+was generated ("Could not determine subsampling type for JPEG image") when such
+an image was passed to `tjDecompressHeader3()`, `tjTransform()`,
+`tjDecompressToYUVPlanes()`, `tjDecompressToYUV2()`, or the equivalent Java
+methods.
+
+2. Fixed an issue (CVE-2018-11813) whereby a specially-crafted malformed input
+file (specifically, a file with a valid Targa header but incomplete pixel data)
+would cause cjpeg to generate a JPEG file that was potentially thousands of
+times larger than the input file. The Targa reader in cjpeg was not properly
+detecting that the end of the input file had been reached prematurely, so after
+all valid pixels had been read from the input, the reader injected dummy pixels
+with values of 255 into the JPEG compressor until the number of pixels
+specified in the Targa header had been compressed. The Targa reader in cjpeg
+now behaves like the PPM reader and aborts compression if the end of the input
+file is reached prematurely. Because this issue only affected cjpeg and not
+the underlying library, and because it did not involve any out-of-bounds reads
+or other exploitable behaviors, it was not believed to represent a security
+threat.
+
+3. Fixed an issue whereby the `tjLoadImage()` and `tjSaveImage()` functions
+would produce a "Bogus message code" error message if the underlying bitmap and
+PPM readers/writers threw an error that was specific to the readers/writers
+(as opposed to a general libjpeg API error.)
+
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
+
+5. Fixed an issue whereby certain combinations of calls to
+`jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
+loop when decompressing progressive JPEG images that use vertical chroma
+subsampling (for instance, 4:2:0 or 4:4:0.)
+
+6. Fixed a segfault in `jpeg_skip_scanlines()` that occurred when decompressing
+a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
+(that is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.)
+
+7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
+extensions if it detects that the compiler does not support DSPr2 instructions.
+
+8. Fixed an out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
+
+9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
+by the Clang and GCC undefined behavior sanitizers, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+did not pose a security threat, but removing the warning made it easier to
+detect actual security issues, should they arise in the future.
+
+
+1.5.90 (2.0 beta1)
+==================
+
+### Significant changes relative to 1.5.3:
+
+1. Added AVX2 SIMD implementations of the colorspace conversion, chroma
+downsampling and upsampling, integer quantization and sample conversion, and
+accurate integer DCT/IDCT algorithms. When using the accurate integer DCT/IDCT
+algorithms on AVX2-equipped CPUs, the compression of RGB images is
+approximately 13-36% (avg. 22%) faster (relative to libjpeg-turbo 1.5.x) with
+64-bit code and 11-21% (avg. 17%) faster with 32-bit code, and the
+decompression of RGB images is approximately 9-35% (avg. 17%) faster with
+64-bit code and 7-17% (avg. 12%) faster with 32-bit code. (As tested on a
+3 GHz Intel Core i7. Actual mileage may vary.)
+
+2. Overhauled the build system to use CMake on all platforms, and removed the
+autotools-based build system. This decision resulted from extensive
+discussions within the libjpeg-turbo community. libjpeg-turbo traditionally
+used CMake only for Windows builds, but there was an increasing amount of
+demand to extend CMake support to other platforms. However, because of the
+unique nature of our code base (the need to support different assemblers on
+each platform, the need for Java support, etc.), providing dual build systems
+as other OSS imaging libraries do (including libpng and libtiff) would have
+created a maintenance burden. The use of CMake greatly simplifies some aspects
+of our build system, owing to CMake's built-in support for various assemblers,
+Java, and unit testing, as well as generally fewer quirks that have to be
+worked around in order to implement our packaging system. Eliminating
+autotools puts our project slightly at odds with the traditional practices of
+the OSS community, since most "system libraries" tend to be built with
+autotools, but it is believed that the benefits of this move outweigh the
+risks. In addition to providing a unified build environment, switching to
+CMake allows for the use of various build tools and IDEs that aren't supported
+under autotools, including Xcode, Ninja, and Eclipse. It also eliminates the
+need to install autotools via MacPorts/Homebrew on OS X and allows
+libjpeg-turbo to be configured without the use of a terminal/command prompt.
+Extensive testing was conducted to ensure that all features provided by the
+autotools-based build system are provided by the new build system.
+
+3. The libjpeg API in this version of libjpeg-turbo now includes two additional
+functions, `jpeg_read_icc_profile()` and `jpeg_write_icc_profile()`, that can
+be used to extract ICC profile data from a JPEG file while decompressing or to
+embed ICC profile data in a JPEG file while compressing or transforming. This
+eliminates the need for downstream projects, such as color management libraries
+and browsers, to include their own glueware for accomplishing this.
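+
+ A minimal extraction sketch (`cinfo` is a decompress object with a data
+source already assigned; error handling omitted, and `<stdlib.h>` is assumed
+for `free()`):
+
+    JOCTET *icc_data = NULL;
+    unsigned int icc_len = 0;
+
+    /* Save APP2 markers before reading the header so that the ICC
+       segments are retained for jpeg_read_icc_profile(). */
+    jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);
+    jpeg_read_header(&cinfo, TRUE);
+
+    if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
+      /* ... hand icc_data/icc_len to a color management library ... */
+      free(icc_data);  /* the caller owns the returned buffer */
+    }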
+
+4. Improved error handling in the TurboJPEG API library (a sketch follows
+this list):
+
+ - Introduced a new function (`tjGetErrorStr2()`) in the TurboJPEG C API
+that allows compression/decompression/transform error messages to be retrieved
+in a thread-safe manner. Retrieving error messages from global functions, such
+as `tjInitCompress()` or `tjBufSize()`, is still thread-unsafe, but since those
+functions will only throw errors if passed an invalid argument or if a memory
+allocation failure occurs, thread safety is not as much of a concern.
+ - Introduced a new function (`tjGetErrorCode()`) in the TurboJPEG C API
+and a new method (`TJException.getErrorCode()`) in the TurboJPEG Java API that
+can be used to determine the severity of the last
+compression/decompression/transform error. This allows applications to
+choose whether to ignore warnings (non-fatal errors) from the underlying
+libjpeg API or to treat them as fatal.
+ - Introduced a new flag (`TJFLAG_STOPONWARNING` in the TurboJPEG C API and
+`TJ.FLAG_STOPONWARNING` in the TurboJPEG Java API) that causes the library to
+immediately halt a compression/decompression/transform operation if it
+encounters a warning from the underlying libjpeg API (the default behavior is
+to allow the operation to complete unless a fatal error is encountered.)
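+
+ A sketch of the new flow in the C API (`handle`, `srcBuf`, `width`, and
+`height` are placeholders):
+
+    unsigned char *jpegBuf = NULL;  /* tjCompress2() allocates on demand */
+    unsigned long jpegSize = 0;
+
+    if (tjCompress2(handle, srcBuf, width, 0 /* pitch */, height, TJPF_RGB,
+                    &jpegBuf, &jpegSize, TJSAMP_420, 90 /* quality */,
+                    TJFLAG_STOPONWARNING) < 0) {
+      /* Per-handle, thread-safe error reporting */
+      int code = tjGetErrorCode(handle);  /* TJERR_WARNING or TJERR_FATAL */
+      fprintf(stderr, "%s error: %s\n",
+              code == TJERR_WARNING ? "non-fatal" : "fatal",
+              tjGetErrorStr2(handle));
+    }
+
+    tjFree(jpegBuf);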
+
+5. Introduced a new flag in the TurboJPEG C and Java APIs (`TJFLAG_PROGRESSIVE`
+and `TJ.FLAG_PROGRESSIVE`, respectively) that causes the library to use
+progressive entropy coding in JPEG images generated by compression and
+transform operations. Additionally, a new transform option
+(`TJXOPT_PROGRESSIVE` in the C API and `TJTransform.OPT_PROGRESSIVE` in the
+Java API) has been introduced, allowing progressive entropy coding to be
+enabled for selected transforms in a multi-transform operation.
+
+6. Introduced a new transform option in the TurboJPEG API (`TJXOPT_COPYNONE` in
+the C API and `TJTransform.OPT_COPYNONE` in the Java API) that allows the
+copying of markers (including EXIF and ICC profile data) to be disabled for a
+particular transform.
+
+7. Added two functions to the TurboJPEG C API (`tjLoadImage()` and
+`tjSaveImage()`) that can be used to load/save a BMP or PPM/PGM image to/from a
+memory buffer with a specified pixel format and layout. These functions
+replace the project-private (and slow) bmp API, which was previously used by
+TJBench, and they also provide a convenient way for first-time users of
+libjpeg-turbo to quickly develop a complete JPEG compression/decompression
+program.
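+
+ A minimal load/re-save sketch ("in.bmp" and "out.ppm" are placeholders;
+error handling omitted):
+
+    int w, h, pf = TJPF_UNKNOWN;  /* let the reader pick the file's format */
+    unsigned char *pixels = tjLoadImage("in.bmp", &w, 1 /* align */, &h,
+                                        &pf, 0);
+
+    if (pixels) {
+      tjSaveImage("out.ppm", pixels, w, 0 /* pitch */, h, pf, 0);
+      tjFree(pixels);  /* buffers from tjLoadImage() are freed with tjFree() */
+    }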
+
+8. The TurboJPEG C API now includes a new convenience array (`tjAlphaOffset[]`)
+that contains the alpha component index for each pixel format (or -1 if the
+pixel format lacks an alpha component.) The TurboJPEG Java API now includes a
+new method (`TJ.getAlphaOffset()`) that returns the same value. In addition,
+the `tjRedOffset[]`, `tjGreenOffset[]`, and `tjBlueOffset[]` arrays-- and the
+corresponding `TJ.getRedOffset()`, `TJ.getGreenOffset()`, and
+`TJ.getBlueOffset()` methods-- now return -1 for `TJPF_GRAY`/`TJ.PF_GRAY`
+rather than 0. This allows programs to easily determine whether a pixel format
+has red, green, blue, and alpha components.
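+
+ For instance (a sketch; the commented values follow the pixel format
+definitions in turbojpeg.h):
+
+    int alpha = tjAlphaOffset[TJPF_BGRA];  /* 3 */
+    int red   = tjRedOffset[TJPF_GRAY];    /* -1: no red component */
+
+    if (alpha >= 0) {
+      /* this pixel format carries an alpha channel at byte offset 'alpha' */
+    }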
+
+9. Added a new example (tjexample.c) that demonstrates the basic usage of the
+TurboJPEG C API. This example mirrors the functionality of TJExample.java.
+Both files are now included in the libjpeg-turbo documentation.
+
+10. Fixed two signed integer overflows in the arithmetic decoder, detected by
+the Clang undefined behavior sanitizer, that could be triggered by attempting
+to decompress a specially-crafted malformed JPEG image. These issues did not
+pose a security threat, but removing the warnings makes it easier to detect
+actual security issues, should they arise in the future.
+
+11. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
+algorithm that caused incorrect dithering in the output image. This algorithm
+now produces bitwise-identical results to the unmerged algorithms.
+
+12. The SIMD function symbols for x86[-64]/ELF, MIPS/ELF, macOS/x86[-64] (if
+libjpeg-turbo is built with Yasm), and iOS/Arm[64] builds are now private.
+This prevents those symbols from being exposed in applications or shared
+libraries that link statically with libjpeg-turbo.
+
+13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
+YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
+chroma upsampling, integer quantization, and accurate integer DCT/IDCT
+algorithms. When using the accurate integer DCT/IDCT, this speeds up the
+compression of RGB images by approximately 70-100% and the decompression of RGB
+images by approximately 2-3.5x.
+
+14. Fixed a build error when building with older MinGW releases (regression
+caused by 1.5.1[7].)
+
+15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
+x86 and x86-64 platforms. This speeds up the compression of full-color
+progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
+when using modern Intel and AMD CPUs.
+
+
+1.5.3
+=====
+
+### Significant changes relative to 1.5.2:
+
+1. Fixed a NullPointerException in the TurboJPEG Java wrapper that occurred
+when using the YUVImage constructor that creates an instance backed by separate
+image planes and allocates memory for the image planes.
+
+2. Fixed an issue whereby the Java version of TJUnitTest would fail when
+testing BufferedImage encoding/decoding on big endian systems.
+
+3. Fixed a segfault in djpeg that would occur if an output format other than
+PPM/PGM was selected along with the `-crop` option. The `-crop` option now
+works with the GIF and Targa formats as well (unfortunately, it cannot be made
+to work with the BMP and RLE formats, because those output engines
+write scanlines in bottom-up order.) djpeg will now exit gracefully if an
+output format other than PPM/PGM, GIF, or Targa is selected along with the
+`-crop` option.
+
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
+
+5. TJBench (both C and Java versions) will now display usage information if any
+command-line argument is unrecognized. This prevents the program from silently
+ignoring typos.
+
+6. Fixed an access violation in tjbench.exe (Windows) that occurred when the
+program was used to decompress an existing JPEG image.
+
+7. Fixed an ArrayIndexOutOfBoundsException in the TJExample Java program that
+occurred when attempting to decompress a JPEG image that had been compressed
+with 4:1:1 chrominance subsampling.
+
+8. Fixed an issue whereby, when using `jpeg_skip_scanlines()` to skip to the
+end of a single-scan (non-progressive) image, subsequent calls to
+`jpeg_consume_input()` would return `JPEG_SUSPENDED` rather than
+`JPEG_REACHED_EOI`.
+
+9. `jpeg_crop_scanline()` now works correctly when decompressing grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).
+
+
+1.5.2
+=====
+
+### Significant changes relative to 1.5.1:
+
+1. Fixed a regression introduced by 1.5.1[7] that prevented libjpeg-turbo from
+building with Android NDK platforms prior to android-21 (5.0).
+
+2. Fixed a regression introduced by 1.5.1[1] that prevented the MIPS DSPr2 SIMD
+code in libjpeg-turbo from building.
+
+3. Fixed a regression introduced by 1.5 beta1[11] that prevented the Java
+version of TJBench from outputting any reference images (the `-nowrite` switch
+was accidentally enabled by default.)
+
+4. libjpeg-turbo should now build and run with full AltiVec SIMD acceleration
+on PowerPC-based AmigaOS 4 and OpenBSD systems.
+
+5. Fixed build and runtime errors on Windows that occurred when building
+libjpeg-turbo with libjpeg v7 API/ABI emulation and the in-memory
+source/destination managers. Due to an oversight, the `jpeg_skip_scanlines()`
+and `jpeg_crop_scanline()` functions were not being included in jpeg7.dll when
+libjpeg-turbo was built with `-DWITH_JPEG7=1` and `-DWITH_MEMSRCDST=1`.
+
+6. Fixed "Bogus virtual array access" error that occurred when using the
+lossless crop feature in jpegtran or the TurboJPEG API, if libjpeg-turbo was
+built with libjpeg v7 API/ABI emulation. This was apparently a long-standing
+bug that has existed since the introduction of libjpeg v7/v8 API/ABI emulation
+in libjpeg-turbo v1.1.
+
+7. The lossless transform features in jpegtran and the TurboJPEG API will now
+always attempt to adjust the EXIF image width and height tags if the image size
+changed as a result of the transform. This behavior has always existed when
+using libjpeg v8 API/ABI emulation. It was supposed to be available with
+libjpeg v7 API/ABI emulation as well but did not work properly due to a bug.
+Furthermore, there was never any good reason not to enable it with libjpeg v6b
+API/ABI emulation, since the behavior is entirely internal. Note that
+`-copy all` must be passed to jpegtran in order to transfer the EXIF tags from
+the source image to the destination image.
+
+8. Fixed several memory leaks in the TurboJPEG API library that could occur
+if the library was built with certain compilers and optimization levels
+(known to occur with GCC 4.x and clang with `-O1` and higher but not with
+GCC 5.x or 6.x) and one of the underlying libjpeg API functions threw an error
+after a TurboJPEG API function allocated a local buffer.
+
+9. The libjpeg-turbo memory manager will now honor the `max_memory_to_use`
+structure member in jpeg\_memory\_mgr, which can be set to the maximum amount
+of memory (in bytes) that libjpeg-turbo should use during decompression or
+multi-pass (including progressive) compression. This limit can also be set
+using the `JPEGMEM` environment variable or using the `-maxmemory` switch in
+cjpeg/djpeg/jpegtran (refer to the respective man pages for more details.)
+This has been a documented feature of libjpeg since v5, but the
+`malloc()`/`free()` implementation of the memory manager (jmemnobs.c) never
+implemented the feature. Restricting libjpeg-turbo's memory usage is useful
+for two reasons: it allows testers to more easily work around the 2 GB limit
+in libFuzzer, and it allows developers of security-sensitive applications to
+more easily defend against one of the progressive JPEG exploits (LJT-01-004)
+identified in
+[this report](http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
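+
+ For example, to cap memory usage at 32 MB from C (a sketch; `cinfo` is a
+created compress or decompress object):
+
+    cinfo.mem->max_memory_to_use = 32L * 1024L * 1024L;  /* bytes */
+
+ The same limit can be set externally with `JPEGMEM=32M` or
+`-maxmemory 32M`.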
+
+10. TJBench will now run each benchmark for 1 second prior to starting the
+timer, in order to improve the consistency of the results. Furthermore, the
+`-warmup` option is now used to specify the amount of warmup time rather than
+the number of warmup iterations.
+
+11. Fixed an error (`short jump is out of range`) that occurred when assembling
+the 32-bit x86 SIMD extensions with NASM versions prior to 2.04. This was a
+regression introduced by 1.5 beta1[12].
+
+
+1.5.1
+=====
+
+### Significant changes relative to 1.5.0:
+
+1. Previously, the undocumented `JSIMD_FORCE*` environment variables could be
+used to force-enable a particular SIMD instruction set if multiple instruction
+sets were available on a particular platform. On x86 platforms, where CPU
+feature detection is bulletproof and multiple SIMD instruction sets are
+available, it makes sense for those environment variables to allow forcing the
+use of an instruction set only if that instruction set is available. However,
+since the ARM implementations of libjpeg-turbo can only use one SIMD
+instruction set, and since their feature detection code is less bulletproof
+(parsing /proc/cpuinfo), it makes sense for the `JSIMD_FORCENEON` environment
+variable to bypass the feature detection code and really force the use of NEON
+instructions. A new environment variable (`JSIMD_FORCEDSPR2`) was introduced
+in the MIPS implementation for the same reasons, and the existing
+`JSIMD_FORCENONE` environment variable was extended to that implementation.
+These environment variables provide a workaround for those attempting to test
+ARM and MIPS builds of libjpeg-turbo in QEMU, which passes through
+/proc/cpuinfo from the host system.
+
+2. libjpeg-turbo previously assumed that AltiVec instructions were always
+available on PowerPC platforms, which led to "illegal instruction" errors when
+running on PowerPC chips that lack AltiVec support (such as the older 7xx/G3
+and newer e5500 series.) libjpeg-turbo now examines /proc/cpuinfo on
+Linux/Android systems and enables AltiVec instructions only if the CPU supports
+them. It also now provides two environment variables, `JSIMD_FORCEALTIVEC` and
+`JSIMD_FORCENONE`, to force-enable and force-disable AltiVec instructions in
+environments where /proc/cpuinfo is an unreliable means of CPU feature
+detection (such as when running in QEMU.) On OS X, libjpeg-turbo continues to
+assume that AltiVec support is always available, which means that libjpeg-turbo
+cannot be used with G3 Macs unless you set the environment variable
+`JSIMD_FORCENONE` to `1`.
+
+3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
+crash when built with recent releases of the Clang/LLVM compiler. This was
+caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
+routines. Those routines were incorrectly using 64-bit instructions to
+transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fancy upsampling is now supported when decompressing JPEG images that use
+4:4:0 (h1v2) chroma subsampling. These images are generated when losslessly
+rotating or transposing JPEG images that use 4:2:2 (h2v1) chroma subsampling.
+The h1v2 fancy upsampling algorithm is not currently SIMD-accelerated.
+
+5. If merged upsampling isn't SIMD-accelerated but YCbCr-to-RGB conversion is,
+then libjpeg-turbo will now disable merged upsampling when decompressing YCbCr
+JPEG images into RGB or extended RGB output images. This significantly speeds
+up the decompression of 4:2:0 and 4:2:2 JPEGs on ARM platforms if fancy
+upsampling is not used (for example, if the `-nosmooth` option to djpeg is
+specified.)
+
+6. The TurboJPEG API will now decompress 4:2:2 and 4:4:0 JPEG images with
+2x2 luminance sampling factors and 2x1 or 1x2 chrominance sampling factors.
+This is a non-standard way of specifying 2x subsampling (normally 4:2:2 JPEGs
+have 2x1 luminance and 1x1 chrominance sampling factors, and 4:4:0 JPEGs have
+1x2 luminance and 1x1 chrominance sampling factors), but the JPEG format and
+the libjpeg API both allow it.
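+
+Such a file could be produced through the libjpeg API with something like the
+following sketch (function name illustrative; error handling omitted):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: configure a compressor for "non-standard" 4:2:2, i.e. 2x2
+   luminance and 1x2 chrominance sampling factors (the same 2x
+   horizontal subsampling ratio as the usual 2x1/1x1 combination). */
+void configure_nonstandard_422(j_compress_ptr cinfo)
+{
+  jpeg_set_defaults(cinfo);               /* requires in_color_space set */
+  cinfo->comp_info[0].h_samp_factor = 2;  /* Y  */
+  cinfo->comp_info[0].v_samp_factor = 2;
+  cinfo->comp_info[1].h_samp_factor = 1;  /* Cb */
+  cinfo->comp_info[1].v_samp_factor = 2;
+  cinfo->comp_info[2].h_samp_factor = 1;  /* Cr */
+  cinfo->comp_info[2].v_samp_factor = 2;
+}
+```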
+
+7. Fixed an unsigned integer overflow in the libjpeg memory manager, detected
+by the Clang undefined behavior sanitizer, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+affected only 32-bit code and did not pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+8. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers when attempting to decompress
+specially-crafted malformed JPEG images. None of these issues posed a security
+threat, but removing the warnings makes it easier to detect actual security
+issues, should they arise in the future.
+
+9. Fixed an out-of-bounds array reference, introduced by 1.4.90[2] (partial
+image decompression) and detected by the Clang undefined behavior sanitizer,
+that could be triggered by a specially-crafted malformed JPEG image with more
+than four components. Because the out-of-bounds reference was still within the
+same structure, it was not known to pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code. Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
+1.5.0
+=====
+
+### Significant changes relative to 1.5 beta1:
+
+1. Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
+path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
+
+2. Added libjpeg-turbo version and build information to the global string table
+of the libjpeg and TurboJPEG API libraries. This is a common practice in other
+infrastructure libraries, such as OpenSSL and libpng, because it makes it easy
+to examine an application binary and determine which version of the library the
+application was linked against.
+
+3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
+in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
+maximum value defined in the file's header and that maximum value was greater
+than 255. libjpeg-turbo 1.4.2 already included a similar fix for ASCII PPM/PGM
+files. Note that these issues were not security bugs, since they were confined
+to the cjpeg program and did not affect any of the libjpeg-turbo libraries.
+
+4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
+header using the `tjDecompressToYUV2()` function would cause the function to
+abort without returning an error and, under certain circumstances, corrupt the
+stack. This only occurred if `tjDecompressToYUV2()` was called prior to
+calling `tjDecompressHeader3()`, or if the return value from
+`tjDecompressHeader3()` was ignored (both cases represent incorrect usage of
+the TurboJPEG API.)
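+
+The correct calling sequence, for reference, is shown in this minimal sketch
+(function name illustrative; error handling abbreviated):
+
+```c
+#include <stdio.h>
+#include <stdlib.h>
+#include <turbojpeg.h>
+
+/* Sketch: check the return value of tjDecompressHeader3() before
+   decompressing to YUV. */
+int decompress_to_yuv(const unsigned char *jpegBuf, unsigned long jpegSize)
+{
+  tjhandle handle = tjInitDecompress();
+  int width, height, subsamp, colorspace;
+  unsigned char *yuvBuf;
+
+  if (!handle) return -1;
+  if (tjDecompressHeader3(handle, jpegBuf, jpegSize, &width, &height,
+                          &subsamp, &colorspace) < 0) {
+    fprintf(stderr, "Header error: %s\n", tjGetErrorStr());
+    tjDestroy(handle);
+    return -1;
+  }
+  yuvBuf = (unsigned char *)malloc(tjBufSizeYUV2(width, 1, height, subsamp));
+  if (!yuvBuf || tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf,
+                                    width, 1, height, 0) < 0) {
+    fprintf(stderr, "Decompress error: %s\n", tjGetErrorStr());
+    free(yuvBuf);
+    tjDestroy(handle);
+    return -1;
+  }
+  /* ... use yuvBuf ... */
+  free(yuvBuf);
+  tjDestroy(handle);
+  return 0;
+}
+```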
+
+5. Fixed an issue in the ARM 32-bit SIMD-accelerated Huffman encoder that
+prevented the code from assembling properly with clang.
+
+6. The `jpeg_stdio_src()`, `jpeg_mem_src()`, `jpeg_stdio_dest()`, and
+`jpeg_mem_dest()` functions in the libjpeg API will now throw an error if a
+source/destination manager has already been assigned to the compress or
+decompress object by a different function or by the calling program. This
+prevents these functions from attempting to reuse a source/destination manager
+structure that was allocated elsewhere, because there is no way to ensure that
+it would be big enough to accommodate the new source/destination manager.
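+
+In other words, each object should have its source or destination manager
+assigned by exactly one mechanism, as in this sketch (function name
+illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: assign exactly one source manager per decompress object.
+   Calling jpeg_stdio_src() after jpeg_mem_src() (or vice versa) on the
+   same object now raises an error instead of silently reusing the
+   previously allocated manager structure. */
+void decode_from_memory(const unsigned char *buf, unsigned long size)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+  jpeg_mem_src(&cinfo, buf, size);  /* sole source manager */
+  jpeg_read_header(&cinfo, TRUE);
+  /* ... decompress ... */
+  jpeg_destroy_decompress(&cinfo);
+}
+```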
+
+
+1.4.90 (1.5 beta1)
+==================
+
+### Significant changes relative to 1.4.2:
+
+1. Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions. Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+2. Added two new libjpeg API functions (`jpeg_skip_scanlines()` and
+`jpeg_crop_scanline()`) that can be used to partially decode a JPEG image. See
+[libjpeg.txt](libjpeg.txt) for more details.
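+
+A sketch of the intended calling sequence (see libjpeg.txt for the
+authoritative description; note that `jpeg_crop_scanline()` may widen the
+requested region to an iMCU boundary, so the adjusted values must be read
+back):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: decode only a rectangular region of interest. */
+void decode_region(j_decompress_ptr cinfo, JDIMENSION xoffset,
+                   JDIMENSION yoffset, JDIMENSION width, JDIMENSION height,
+                   JSAMPARRAY rows)
+{
+  JDIMENSION y;
+
+  jpeg_start_decompress(cinfo);
+  jpeg_crop_scanline(cinfo, &xoffset, &width);  /* adjusts both values */
+  jpeg_skip_scanlines(cinfo, yoffset);          /* skip rows above the ROI */
+  for (y = 0; y < height; y++)
+    jpeg_read_scanlines(cinfo, &rows[y], 1);
+  jpeg_skip_scanlines(cinfo, cinfo->output_height - yoffset - height);
+  jpeg_finish_decompress(cinfo);
+}
+```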
+
+3. The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+4. The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+5. Source buffers for the TurboJPEG C API functions, as well as the
+`jpeg_mem_src()` function in the libjpeg API, are now declared as const
+pointers. This facilitates passing read-only buffers to those functions and
+ensures the caller that the source buffer will not be modified. This should
+not create any backward API or ABI incompatibilities with prior libjpeg-turbo
+releases.
+
+6. The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+7. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers. Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+8. Removed the unnecessary `.arch` directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+9. Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions. This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time. Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+10. The x86-64 SIMD code can now be disabled at run time by setting the
+`JSIMD_FORCENONE` environment variable to `1` (the other SIMD implementations
+already had this capability.)
+
+11. Added a new command-line argument to TJBench (`-nowrite`) that prevents the
+benchmark from outputting any images. This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+12. Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and x86-64
+platforms. This speeds up the compression of full-color JPEGs by about 10-15%
+on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and AMD
+CPUs. Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(<https://llvm.org/bugs/show_bug.cgi?id=16035>). For the purposes of
+benchmarking or regression testing, SIMD-accelerated Huffman encoding can be
+disabled by setting the `JSIMD_NOHUFFENC` environment variable to `1`.
+
+13. Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the accurate integer forward DCT and h2v2 &
+h2v1 downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.) This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+14. Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+ For 32-bit code, this speeds up the compression of full-color JPEGs by
+about 30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by
+about 6-7% on average on a typical Android device (Nexus 5X, Cortex-A53 and
+Cortex-A57), relative to libjpeg-turbo 1.4.x. Note that the larger speedup
+under iOS is due to the fact that iOS builds use LLVM, which does not optimize
+the C Huffman encoder as well as GCC does.
+
+ For 64-bit code, NEON-accelerated Huffman encoding speeds up the
+compression of full-color JPEGs by about 40% on average on a typical iOS device
+(iPhone 5S, Apple A7) and by about 7-8% on average on a typical Android device
+(Nexus 5X, Cortex-A53 and Cortex-A57), in addition to the speedup described in
+[13] above.
+
+ For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the `JSIMD_NOHUFFENC` environment
+variable to `1`.
+
+15. pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems. Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+16. Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines. This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+17. Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed. In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer. However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+18. Fixed a memory leak in tjunittest encountered when running the program
+with the `-yuv` option.
+
+
+1.4.2
+=====
+
+### Significant changes relative to 1.4.1:
+
+1. Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+2. Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used. This was known to cause `make test` to fail when the
+library was built with `-march=haswell` on x86 systems.
+
+3. Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler. This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines. Those routines were incorrectly using a 64-bit `mov` instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels. The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+5. Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers. None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+### Significant changes relative to 1.4.0:
+
+1. tjbench now properly handles CMYK/YCCK JPEG files. Passing an argument of
+`-cmyk` (instead of, for instance, `-rgb`) will cause tjbench to internally
+convert the source bitmap to CMYK prior to compression, to generate YCCK JPEG
+files, and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.) The CMYK<->RGB conversion operation is
+not benchmarked. NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only. Proper conversion between CMYK and RGB
+requires a color management system.
+
+2. `make test` now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+3. `make test` no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in [Makefile.am](Makefile.am) for information on how to
+re-enable the tests and to specify an expected result for them based on the
+particulars of your platform.
+
+4. The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+5. Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the `-nosmooth` option
+on a MIPS machine that lacked DSPr2 support. The MIPS SIMD routines for h2v1
+and h2v2 merged upsampling were not properly checking for the existence of
+DSPr2.
+
+6. Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.) Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux. Oops.
+
+7. Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the accurate integer forward DCT
+were used.
+
+8. The Windows (CMake) build system now supports building only static or only
+shared libraries. This is accomplished by adding either `-DENABLE_STATIC=0` or
+`-DENABLE_SHARED=0` to the CMake command line.
+
+9. TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API. For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+10. Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+### Significant changes relative to 1.4 beta1:
+
+1. Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the `le32toh()` and `htole32()` functions.)
+
+2. The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines. This has been fixed.
+
+3. Fixed an issue in `tjPlaneSizeYUV()` whereby it would erroneously return 1
+instead of -1 if `componentID` was > 0 and `subsamp` was `TJSAMP_GRAY`.
+
+4. Fixed an issue in `tjBufSizeYUV2()` whereby it would erroneously return 0
+instead of -1 if `width` was < 1.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM64 platforms (see 1.4 beta1[5].)
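+
+In C terms, the technique corresponds to roughly the following sketch
+(function name illustrative):
+
+```c
+/* Sketch: bit length of a nonzero coefficient magnitude.  GCC/Clang
+   lower __builtin_clz() to 'clz' on ARM and 'bsr' on x86, avoiding
+   the 64 KB lookup table. */
+static int jpeg_nbits(unsigned int x)  /* x must be nonzero */
+{
+  return 32 - __builtin_clz(x);
+}
+```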
+
+6. The `close()` method in the TJCompressor and TJDecompressor Java classes is
+now idempotent. Previously, that method would call the native `tjDestroy()`
+function even if the TurboJPEG instance had already been destroyed. This
+caused an exception to be thrown during finalization, if the `close()` method
+had already been called. The exception was caught, but it was still an
+expensive operation.
+
+7. The TurboJPEG API previously generated an error (`Could not determine
+subsampling type for JPEG image`) when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`). Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor. However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+8. cjpeg, djpeg, and jpegtran now accept an argument of `-version`, which will
+print the library version and exit.
+
+9. Referring to 1.4 beta1[15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun. Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks. Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+10. The new `tjPlaneSizeYUV()`, `tjPlaneWidth()`, and `tjPlaneHeight()`
+functions were not actually usable on any platform except OS X and Windows,
+because those functions were not included in the libturbojpeg mapfile. This
+has been fixed.
+
+11. Restored the `JPP()`, `JMETHOD()`, and `FAR` macros in the libjpeg-turbo
+header files. The `JPP()` and `JMETHOD()` macros were originally implemented
+in libjpeg as a way of supporting non-ANSI compilers that lacked support for
+prototype parameters. libjpeg-turbo has never supported such compilers, but
+some software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the `FAR` macro. A
+pretty good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+12. Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+### Significant changes relative to 1.3.1:
+
+1. New features in the TurboJPEG API:
+
+ - YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+ - The decompress-to-YUV function has been extended to support image
+scaling.
+ - JPEG images can now be compressed from YUV planar source images.
+ - YUV planar images can now be decoded into RGB or grayscale images.
+ - 4:1:1 subsampling is now supported. This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+ - CMYK images are now supported. This feature allows CMYK source images
+to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to
+CMYK destination images. Conversion between CMYK/YCCK and RGB or YUV images is
+not supported. Such conversion requires a color management system and is thus
+out of scope for a codec library.
+ - The handling of YUV images in the Java API has been significantly
+refactored and should now be much more intuitive.
+ - The Java API now supports encoding a YUV image from an arbitrary
+position in a large image buffer.
+ - All of the YUV functions now have a corresponding function that operates
+on separate image planes instead of a unified image buffer. This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image. It also allows for handling YUV formats that swap the order of the
+U and V planes. (See the sketch below.)
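+
+For reference, the per-plane geometry for such buffers can be computed with
+the new helper functions, as in this minimal sketch:
+
+```c
+#include <stdio.h>
+#include <turbojpeg.h>
+
+/* Sketch: per-plane geometry for a 640x480 4:2:0 YUV image (a stride
+   of 0 selects the plane width). */
+int main(void)
+{
+  int width = 640, height = 480, i;
+
+  for (i = 0; i < 3; i++)   /* 0 = Y, 1 = U/Cb, 2 = V/Cr */
+    printf("plane %d: %dx%d, %lu bytes\n", i,
+           tjPlaneWidth(i, width, TJSAMP_420),
+           tjPlaneHeight(i, height, TJSAMP_420),
+           tjPlaneSizeYUV(i, width, 0, height, TJSAMP_420));
+  return 0;
+}
+```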
+
+2. Added SIMD acceleration for DSPr2-capable MIPS platforms. This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+3. If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables. In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application. An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+4. The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.) This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later. OS X 10.4 "Tiger" is no longer supported.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM platforms rather than a lookup table. This reduces the memory footprint
+by 64k, which may be important for some mobile applications. Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature. Actual mileage may vary.
+
+6. Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built. This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+7. Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation. Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature. They generally do not produce significantly better
+accuracy than the accurate integer DCT/IDCT algorithms, and they are quite a
+bit slower.
+
+8. Added a new output colorspace (`JCS_RGB565`) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels. If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
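+
+Requesting the new colorspace is a one-field change in a libjpeg-based
+decoder, as in this sketch (function name illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: request 16-bit RGB565 output.  Call after jpeg_read_header()
+   and before jpeg_start_decompress(); each output pixel is then one
+   16-bit RRRRRGGGGGGBBBBB value. */
+void request_rgb565(j_decompress_ptr cinfo)
+{
+  cinfo->out_color_space = JCS_RGB565;
+}
+```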
+
+9. Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+10. Fixed a segfault that occurred when calling `output_message()` with
+`msg_code` set to `JMSG_COPYRIGHT`.
+
+11. Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+12. Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+13. Restored 12-bit-per-component JPEG support. A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of `--with-12bit` to
+configure (Unix) or `-DWITH_12BIT=1` to cmake (Windows.) 12-bit JPEG support
+is included only for convenience. Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API. The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+14. Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.) For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager. This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
+
+16. Fixed an oversight in the TurboJPEG C wrapper: if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the `jpegSize` parameter was equal to the size of the buffer, when
+in fact that parameter was probably equal to the size of the most recently
+compressed JPEG image. If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
+1.3.1
+=====
+
+### Significant changes relative to 1.3.0:
+
+1. On Un*x systems, `make install` now installs the libjpeg-turbo libraries
+into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
+and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
+x86-64. You can override this by overriding either the `prefix` or `libdir`
+configure variables.
+
+2. The Windows installer now places a copy of the TurboJPEG DLLs in the same
+directory as the rest of the libjpeg-turbo binaries. This was mainly done
+to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
+When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
+access the c:\WINDOWS\system32 directory, which made it impossible for the
+TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
+
+3. Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
+entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
+jpegtran, for instance) would result in an error, `Requested feature was
+omitted at compile time`.
+
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
+
+5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
+when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
+source image, and added a unit test to check for this error.
+
+6. The Java classes should now build properly under Visual Studio 2010 and
+later.
+
+7. Fixed an issue that prevented SRPMs generated using the in-tree packaging
+tools from being rebuilt on certain newer Linux distributions.
+
+8. Numerous minor fixes to eliminate compilation and build/packaging system
+warnings, fix cosmetic issues, improve documentation clarity, and other general
+source cleanup.
+
+
+1.3.0
+=====
+
+### Significant changes relative to 1.3 beta1:
+
+1. `make test` now works properly on FreeBSD, and it no longer requires the
+md5sum executable to be present on other Un*x platforms.
+
+2. Overhauled the packaging system:
+
+ - To avoid conflict with vendor-supplied libjpeg-turbo packages, the
+official RPMs and DEBs for libjpeg-turbo have been renamed to
+"libjpeg-turbo-official".
+ - The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
+official Linux and Mac packages, to avoid conflict with vendor-supplied
+packages and also to streamline the packaging system.
+ - Release packages are now created with the directory structure defined
+by the configure variables `prefix`, `bindir`, `libdir`, etc. (Un\*x) or by the
+`CMAKE_INSTALL_PREFIX` variable (Windows.) The exception is that the docs are
+always located under the system default documentation directory on Un\*x and
+Mac systems, and on Windows, the TurboJPEG DLL is always located in the Windows
+system directory.
+ - To avoid confusion, official libjpeg-turbo packages on Linux/Unix
+platforms (except for Mac) will always install the 32-bit libraries in
+/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
+ - Fixed an issue whereby, in some cases, the libjpeg-turbo executables on
+Un*x systems were not properly linking with the shared libraries installed by
+the same package.
+ - Fixed an issue whereby building the "installer" target on Windows when
+`WITH_JAVA=1` would fail if the TurboJPEG JAR had not been previously built.
+ - Building the "install" target on Windows now installs files into the
+same places that the installer does.
+
+3. Fixed a Huffman encoder bug that prevented I/O suspension from working
+properly.
+
+
+1.2.90 (1.3 beta1)
+==================
+
+### Significant changes relative to 1.2.1:
+
+1. Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
+11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing. Note that the IDCT will
+not be SIMD-accelerated when using any of these new scaling factors.
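+
+A decompressor selects one of the new factors through the existing
+`scale_num`/`scale_denom` fields, as in this sketch (function name
+illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: request 7/8 scaling.  Set after jpeg_read_header() and
+   before jpeg_start_decompress(); the scaled output dimensions can
+   then be obtained with jpeg_calc_output_dimensions(). */
+void request_7_8_scaling(j_decompress_ptr cinfo)
+{
+  cinfo->scale_num = 7;
+  cinfo->scale_denom = 8;
+}
+```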
+
+2. The TurboJPEG dynamic library is now versioned. It was not strictly
+necessary to do so, because TurboJPEG uses versioned symbols, and if a function
+changes in an ABI-incompatible way, that function is renamed and a legacy
+function is provided to maintain backward compatibility. However, certain
+Linux distro maintainers have a policy against accepting any library that isn't
+versioned.
+
+3. Extended the TurboJPEG Java API so that it can be used to compress a JPEG
+image from and decompress a JPEG image to an arbitrary position in a large
+image buffer.
+
+4. The `tjDecompressToYUV()` function now supports the `TJFLAG_FASTDCT` flag.
+
+5. The 32-bit supplementary package for amd64 Debian systems now provides
+symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
+This allows those libraries to be used on MultiArch-compatible systems (such as
+Ubuntu 11 and later) without setting the linker path.
+
+6. The TurboJPEG Java wrapper should now find the JNI library on Mac systems
+without having to pass `-Djava.library.path=/usr/lib` to java.
+
+7. TJBench has been ported to Java to provide a convenient way of validating
+the performance of the TurboJPEG Java API. It can be run with
+`java -cp turbojpeg.jar TJBench`.
+
+8. cjpeg can now be used to generate JPEG files with the RGB colorspace
+(feature ported from jpeg-8d.)
+
+9. The width and height in the `-crop` argument passed to jpegtran can now be
+suffixed with `f` to indicate that, when the upper left corner of the cropping
+region is automatically moved to the nearest iMCU boundary, the bottom right
+corner should be moved by the same amount. In other words, this feature causes
+jpegtran to strictly honor the specified width/height rather than the specified
+bottom right corner (feature ported from jpeg-8d.)
+
+10. JPEG files using the RGB colorspace can now be decompressed into grayscale
+images (feature ported from jpeg-8d.)
+
+11. Fixed a regression caused by 1.2.1[7] whereby the build would fail with
+multiple "Mismatch in operand sizes" errors when attempting to build the x86
+SIMD code with NASM 0.98.
+
+12. The in-memory source/destination managers (`jpeg_mem_src()` and
+`jpeg_mem_dest()`) are now included by default when building libjpeg-turbo with
+libjpeg v6b or v7 emulation, so that programs can take advantage of these
+functions without requiring the use of the backward-incompatible libjpeg v8
+ABI. The "age number" of the libjpeg-turbo library on Un*x systems has been
+incremented by 1 to reflect this. You can disable this feature with a
+configure/CMake switch in order to retain strict API/ABI compatibility with the
+libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
+[README.md](README.md) for more details.
+
+13. Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
+libjpeg-turbo binary package for OS X, so that those libraries can be used to
+build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
+
+
+1.2.1
+=====
+
+### Significant changes relative to 1.2.0:
+
+1. Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+2. When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF. This has been fixed. TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+3. Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
+
+5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors. The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular. Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+6. Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+7. Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if each row in the output image was not evenly
+divisible by 16 bytes.
+
+8. Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+9. Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
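+
+For example, a caller might pass one of the new flags to `tjCompress2()` as
+in this sketch (function name illustrative):
+
+```c
+#include <turbojpeg.h>
+
+/* Sketch: force the accurate integer DCT for compression by passing
+   TJFLAG_ACCURATEDCT in the flags argument. */
+int compress_accurate(tjhandle handle, unsigned char *srcBuf,
+                      int width, int height,
+                      unsigned char **jpegBuf, unsigned long *jpegSize)
+{
+  return tjCompress2(handle, srcBuf, width, 0 /* pitch */, height,
+                     TJPF_RGB, jpegBuf, jpegSize, TJSAMP_420, 95,
+                     TJFLAG_ACCURATEDCT);
+}
+```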
+
+
+1.2.0
+=====
+
+### Significant changes relative to 1.2 beta1:
+
+1. Fixed build issue with Yasm on Unix systems (the libjpeg-turbo build system
+was not adding the current directory to the assembler include path, so Yasm
+was not able to find jsimdcfg.inc.)
+
+2. Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind. See <http://crbug.com/72399> for more information.
+
+3. Added a compile-time macro (`LIBJPEG_TURBO_VERSION`) that can be used to
+check the version of libjpeg-turbo against which an application was compiled.
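+
+Because the macro expands to a bare version number (e.g. 1.2.0) rather than a
+string literal, printing it requires stringification, as in this sketch
+(macro names illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>  /* pulls in jconfig.h, which defines the macro */
+
+#define LJT_STR2(v) #v
+#define LJT_STR(v) LJT_STR2(v)
+
+int main(void)
+{
+  printf("Built against libjpeg-turbo %s\n", LJT_STR(LIBJPEG_TURBO_VERSION));
+  return 0;
+}
+```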
+
+4. Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
+and pixel formats (TurboJPEG API), which allow applications to specify that,
+when decompressing to a 4-component RGB buffer, the unused byte should be set
+to 0xFF so that it can be interpreted as an opaque alpha channel.
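+
+On the libjpeg API side, for example, opting in is a single field (sketch;
+function name illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: decompress into 4-byte RGBA pixels; the unused fourth byte
+   of each pixel is set to 0xFF (opaque alpha).  Set after
+   jpeg_read_header() and before jpeg_start_decompress(). */
+void request_rgba(j_decompress_ptr cinfo)
+{
+  cinfo->out_color_space = JCS_EXT_RGBA;
+}
+```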
+
+5. Fixed regression issue whereby DevIL failed to build against libjpeg-turbo
+because libjpeg-turbo's distributed version of jconfig.h contained an `INLINE`
+macro, which conflicted with a similar macro in DevIL. This macro is used only
+internally when building libjpeg-turbo, so it was moved into config.h.
+
+6. libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
+K component is assigned a component ID of 1 instead of 4. Although these files
+are in violation of the spec, other JPEG implementations handle them
+correctly.
+
+7. Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
+the official libjpeg-turbo binary package for OS X, so that those libraries can
+be used to build both OS X and iOS applications.
+
+
+1.1.90 (1.2 beta1)
+==================
+
+### Significant changes relative to 1.1.1:
+
+1. Added a Java wrapper for the TurboJPEG API. See [java/README](java/README)
+for more details.
+
+2. The TurboJPEG API can now be used to scale down images during
+decompression.
+
+3. Added SIMD routines for RGB-to-grayscale color conversion, which
+significantly improves the performance of grayscale JPEG compression from an
+RGB source image.
+
+4. Improved the performance of the C color conversion routines, which are used
+on platforms for which SIMD acceleration is not available.
+
+5. Added a function to the TurboJPEG API that performs lossless transforms.
+This function is implemented using the same back end as jpegtran, but it
+performs transcoding entirely in memory and allows multiple transforms and/or
+crop operations to be batched together, so the source coefficients only need to
+be read once. This is useful when generating image tiles from a single source
+JPEG.
+
+6. Added tests for the new TurboJPEG scaled decompression and lossless
+transform features to tjbench (the TurboJPEG benchmark, formerly called
+"jpgtest".)
+
+7. Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
+was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
+transposed or rotated 90 degrees.
+
+8. All legacy VirtualGL code has been re-factored, and this has allowed
+libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
+
+9. libjpeg-turbo can now be built with Yasm.
+
+10. Added SIMD acceleration for ARM Linux and iOS platforms that support
+NEON instructions.
+
+11. Refactored the TurboJPEG C API and documented it using Doxygen. The
+TurboJPEG 1.2 API uses pixel formats to define the size and component order of
+the uncompressed source/destination images, and it includes a more efficient
+version of `TJBUFSIZE()` that computes a worst-case JPEG size based on the
+level of chrominance subsampling. The refactored implementation of the
+TurboJPEG API now uses the libjpeg memory source and destination managers,
+which allows the TurboJPEG compressor to grow the JPEG buffer as necessary.
+
+12. Eliminated errors in the output of jpegtran on Windows that occurred when
+the application was invoked using I/O redirection
+(`jpegtran <input.jpg >output.jpg`.)
+
+13. The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
+support in libjpeg-turbo v1.1.0 introduced several new error constants in
+jerror.h, and these were mistakenly enabled for all emulation modes, causing
+the error enum in libjpeg-turbo to sometimes have different values than the
+same enum in libjpeg. This represents an ABI incompatibility, and it caused
+problems with rare applications that took specific action based on a particular
+error value. The fix was to include the new error constants conditionally
+based on whether libjpeg v7 or v8 emulation was enabled.
+
+14. Fixed an issue whereby Windows applications that used libjpeg-turbo would
+fail to compile if the Windows system headers were included before jpeglib.h.
+This issue was caused by a conflict in the definition of the INT32 type.
+
+15. Fixed 32-bit supplementary package for amd64 Debian systems, which was
+broken by enhancements to the packaging system in 1.1.
+
+16. When decompressing a JPEG image using an output colorspace of
+`JCS_EXT_RGBX`, `JCS_EXT_BGRX`, `JCS_EXT_XBGR`, or `JCS_EXT_XRGB`,
+libjpeg-turbo will now set the unused byte to 0xFF, which allows applications
+to interpret that byte as an alpha channel (0xFF = opaque).
+
+
+1.1.1
+=====
+
+### Significant changes relative to 1.1.0:
+
+1. Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
+by `tjEncodeYUV()`.
+
+2. libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
+markers found in the middle of the JPEG data stream during decompression. It
+will now hand off decoding of a particular block to the unaccelerated Huffman
+decoder if an unexpected marker is found, so that the unaccelerated Huffman
+decoder can generate an appropriate warning.
+
+3. Older versions of MinGW64 prefixed symbol names with underscores by
+default, which differed from the behavior of 64-bit Visual C++. MinGW64 1.0
+has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
+this, the libjpeg-turbo SIMD function names are no longer prefixed with an
+underscore when building with MinGW64. This means that, when building
+libjpeg-turbo with older versions of MinGW64, you will now have to add
+`-fno-leading-underscore` to the `CFLAGS`.
+
+4. Fixed a regression bug in the NSIS script that caused the Windows installer
+build to fail when using the Visual Studio IDE.
+
+5. Fixed a bug in `jpeg_read_coefficients()` whereby it would not initialize
+`cinfo->image_width` and `cinfo->image_height` if libjpeg v7 or v8 emulation
+was enabled. This specifically caused the jpegoptim program to fail if it was
+linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
+emulation.
+
+6. Eliminated excessive I/O overhead that occurred when reading BMP files in
+cjpeg.
+
+7. Eliminated errors in the output of cjpeg on Windows that occurred when the
+application was invoked using I/O redirection (`cjpeg <inputfile >output.jpg`.)
+
+
+1.1.0
+=====
+
+### Significant changes relative to 1.1 beta1:
+
+1. The algorithm used by the SIMD quantization function cannot produce correct
+results when the JPEG quality is >= 98 and the fast integer forward DCT is
+used. Thus, the non-SIMD quantization function is now used for those cases,
+and libjpeg-turbo should now produce identical output to libjpeg v6b in all
+cases.
+
+2. Despite the above, the fast integer forward DCT still degrades somewhat for
+JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
+use the accurate integer forward DCT when generating JPEG images of quality 96
+or greater. This reduces compression performance by as much as 15% for these
+high-quality images but is necessary to ensure that the images are perceptually
+lossless. It also ensures that the library can avoid the performance pitfall
+created by [1].
+
+3. Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
+
+4. Fixed visual artifacts in grayscale JPEG compression caused by a typo in
+the RGB-to-luminance lookup tables.
+
+5. The Windows distribution packages now include the libjpeg run-time programs
+(cjpeg, etc.)
+
+6. All packages now include jpgtest.
+
+7. The TurboJPEG dynamic library now uses versioned symbols.
+
+8. Added two new TurboJPEG API functions, `tjEncodeYUV()` and
+`tjDecompressToYUV()`, to replace the somewhat hackish `TJ_YUV` flag.
+
+
+1.0.90 (1.1 beta1)
+==================
+
+### Significant changes relative to 1.0.1:
+
+1. Added emulation of the libjpeg v7 and v8 APIs and ABIs. See
+[README.md](README.md) for more details. This feature was sponsored by
+CamTrace SAS.
+
+2. Created a new CMake-based build system for the Visual C++ and MinGW builds.
+
+3. Grayscale bitmaps can now be compressed from/decompressed to using the
+TurboJPEG API.
+
+4. jpgtest can now be used to test decompression performance with existing
+JPEG images.
+
+5. If the default install prefix (/opt/libjpeg-turbo) is used, then
+`make install` now creates /opt/libjpeg-turbo/lib32 and
+/opt/libjpeg-turbo/lib64 sym links to duplicate the behavior of the binary
+packages.
+
+6. All symbols in the libjpeg-turbo dynamic library are now versioned, even
+when the library is built with libjpeg v6b emulation.
+
+7. Added arithmetic encoding and decoding support (can be disabled with
+configure or CMake options.)
+
+8. Added a `TJ_YUV` flag to the TurboJPEG API, which causes both the compressor
+and decompressor to output planar YUV images.
+
+9. Added an extended version of `tjDecompressHeader()` to the TurboJPEG API,
+which allows the caller to determine the type of subsampling used in a JPEG
+image.
+
+10. Added further protections against invalid Huffman codes.
+
+
+1.0.1
+=====
+
+### Significant changes relative to 1.0.0:
+
+1. The Huffman decoder will now handle erroneous Huffman codes (for instance,
+from a corrupt JPEG image.) Previously, these would cause libjpeg-turbo to
+crash under certain circumstances.
+
+2. Fixed typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
+be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
+
+3. The configure script will now automatically determine whether the
+`INCOMPLETE_TYPES_BROKEN` macro should be defined.
+
+
+1.0.0
+=====
+
+### Significant changes relative to 0.0.93:
+
+1. 2983700: Further FreeBSD build tweaks (no longer necessary to specify
+`--host` when configuring on a 64-bit system.)
+
+2. Created symlinks in the Unix/Linux packages so that the TurboJPEG
+include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
+static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
+64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
+
+3. The Unix/Linux distribution packages now include the libjpeg run-time
+programs (cjpeg, etc.) and man pages.
+
+4. Created a 32-bit supplementary package for amd64 Debian systems, which
+contains just the 32-bit libjpeg-turbo libraries.
+
+5. Moved the libraries from */lib32 to */lib in the i386 Debian package.
+
+6. Included a distribution package for Cygwin.
+
+7. No longer necessary to specify `--without-simd` on non-x86 architectures,
+and unit tests now work on those architectures.
+
+
+0.0.93
+======
+
+### Significant changes since 0.0.91:
+
+1. 2982659: Fixed x86-64 build on FreeBSD systems
+
+2. 2988188: Added support for Windows 64-bit systems
+
+
+0.0.91
+======
+
+### Significant changes relative to 0.0.90:
+
+1. Added documentation to .deb packages
+
+2. 2968313: Fixed data corruption issues when decompressing large JPEG images
+and/or using buffered I/O with the libjpeg-turbo decompressor
+
+
+0.0.90
+======
+
+Initial release
diff --git a/media/libjpeg/LICENSE.md b/media/libjpeg/LICENSE.md
index 4623e29425..d753e1d76a 100644
--- a/media/libjpeg/LICENSE.md
+++ b/media/libjpeg/LICENSE.md
@@ -9,12 +9,12 @@ libjpeg-turbo is covered by three compatible BSD-style open source licenses:
This license applies to the libjpeg API library and associated programs
(any code inherited from libjpeg, and any modifications to that code.)
-- The Modified (3-clause) BSD License, which is listed in
- [turbojpeg.c](turbojpeg.c)
+- The Modified (3-clause) BSD License, which is listed below
- This license covers the TurboJPEG API library and associated programs.
+ This license covers the TurboJPEG API library and associated programs, as
+ well as the build system.
-- The zlib License, which is listed in [simd/jsimdext.inc](simd/jsimdext.inc)
+- The [zlib License](https://opensource.org/licenses/Zlib)
This license is a subset of the other two, and it covers the libjpeg-turbo
SIMD extensions.
@@ -66,7 +66,7 @@ best of our understanding.
2. If your binary distribution includes or uses the TurboJPEG API, then
your product documentation must include the text of the Modified BSD
- License.
+ License (see below.)
**Origin**
- Clause 2 of the Modified BSD License
@@ -86,3 +86,47 @@ best of our understanding.
- IJG License
- Modified BSD License
- zlib License
+
+
+The Modified (3-clause) BSD License
+===================================
+
+Copyright (C)2009-2022 D. R. Commander. All Rights Reserved.<br>
+Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+Why Three Licenses?
+===================
+
+The zlib License could have been used instead of the Modified (3-clause) BSD
+License, and since the IJG License effectively subsumes the distribution
+conditions of the zlib License, this would have effectively placed
+libjpeg-turbo binary distributions under the IJG License. However, the IJG
+License specifically refers to the Independent JPEG Group and does not extend
+attribution and endorsement protections to other entities. Thus, it was
+desirable to choose a license that granted us the same protections for new code
+that were granted to the IJG for code derived from their software.
diff --git a/media/libjpeg/MOZCHANGES b/media/libjpeg/MOZCHANGES
index 6e7824cdd7..4e65df2222 100644
--- a/media/libjpeg/MOZCHANGES
+++ b/media/libjpeg/MOZCHANGES
@@ -48,6 +48,34 @@ To upgrade to a new revision of libjpeg-turbo, do the following:
$ hg addremove
+== February 28, 2022 (libjpeg-turbo v2.1.3 c5f269eb9665435271c05fbcaf8721fa58e9eafa 2022-02-25) ==
+
+* Updated to v2.1.3 release.
+
+== September 9, 2021 (libjpeg-turbo v2.1.1 0a9b9721782d3a60a5c16c8c9a7abf3d4b1ecd42 2021-08-10) ==
+
+* Updated to v2.1.1 release.
+
+== November 19, 2020 (libjpeg-turbo v2.0.6 10ba6ed3365615ed5c2995fe2d240cb2d5000173 2020-11-16) ==
+
+* Updated to v2.0.6 release.
+
+== January 6, 2020 (libjpeg-turbo v2.0.4 166e34213e4f4e2363ce058a7bcc69fd03e38b76 2019-12-31) ==
+
+* Updated to v2.0.4 release.
+
+== September 5, 2019 (libjpeg-turbo v2.0.3 5db6a6819d0f904e0b58f34ae928fea234adb1a0 2019-09-04) ==
+
+* Updated to v2.0.3 release.
+
+== October 4, 2018 (libjpeg-turbo v2.0.0 574f3a772c96dc9db2c98ef24706feb3f6dbda9a 2018-06-27) ==
+
+* Updated to v2.0.0 release.
+
+== July 13, 2017 (libjpeg-turbo v1.5.2 e5c1613ccdfeffcd060fd94248b7c8ac7c0cfb0f 2017-08-09) ==
+
+* Updated to v1.5.2 release.
+
== September 22, 2016 (libjpeg-turbo v1.5.1 cb88e5da8003afcdc443b787fdcb77285e5a8a02 2016-09-20) ==
* Updated to v1.5.1 release.
diff --git a/media/libjpeg/README.ijg b/media/libjpeg/README.ijg
index 9c450ceb07..9453c19501 100644
--- a/media/libjpeg/README.ijg
+++ b/media/libjpeg/README.ijg
@@ -43,7 +43,7 @@ User documentation:
change.log Version-to-version change highlights.
Programmer and internal documentation:
libjpeg.txt How to use the JPEG library in your own programs.
- example.c Sample code for calling the JPEG library.
+ example.txt Sample code for calling the JPEG library.
structure.txt Overview of the JPEG library's internal structure.
coderules.txt Coding style rules --- please read if you contribute code.
@@ -128,7 +128,7 @@ with respect to this software, its quality, accuracy, merchantability, or
fitness for a particular purpose. This software is provided "AS IS", and you,
its user, assume the entire risk as to its quality and accuracy.
-This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
All Rights Reserved except as specified below.
Permission is hereby granted to use, copy, modify, and distribute this
@@ -159,25 +159,6 @@ commercial products, provided that all warranty or liability claims are
assumed by the product vendor.
-The Unix configuration script "configure" was produced with GNU Autoconf.
-It is copyright by the Free Software Foundation but is freely distributable.
-The same holds for its supporting scripts (config.guess, config.sub,
-ltmain.sh). Another support script, install-sh, is copyright by X Consortium
-but is also freely distributable.
-
-The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
-support has been removed altogether, and the GIF writer has been simplified
-to produce "uncompressed GIFs". This technique does not use the LZW
-algorithm; the resulting GIF files are larger than usual, but are readable
-by all standard GIF decoders.
-
-We are required to state that
- "The Graphics Interchange Format(c) is the Copyright property of
- CompuServe Incorporated. GIF(sm) is a Service Mark property of
- CompuServe Incorporated."
-
-
REFERENCES
==========
@@ -185,8 +166,8 @@ We recommend reading one or more of these references before trying to
understand the innards of the JPEG software.
The best short technical introduction to the JPEG compression algorithm is
- Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
- Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+ Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
+ Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
(Adjacent articles in that issue discuss MPEG motion picture compression,
applications of JPEG, and related topics.) If you don't have the CACM issue
handy, a PDF file containing a revised version of Wallace's article is
@@ -220,21 +201,21 @@ Continuous-tone Still Images, Part 2: Compliance testing" and has document
numbers ISO/IEC IS 10918-2, ITU-T T.83.
The JPEG standard does not specify all details of an interchangeable file
-format. For the omitted details we follow the "JFIF" conventions, revision
-1.02. JFIF 1.02 has been adopted as an Ecma International Technical Report
-and thus received a formal publication status. It is available as a free
-download in PDF format from
-http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
-A PostScript version of the JFIF document is available at
-http://www.ijg.org/files/jfif.ps.gz. There is also a plain text version at
-http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.
-
-The TIFF 6.0 file format specification can be obtained by FTP from
-ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation scheme
-found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
-IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
-Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7). Copies of this Note can be obtained from
+format. For the omitted details, we follow the "JFIF" conventions, revision
+1.02. JFIF version 1 has been adopted as ISO/IEC 10918-5 (05/2013) and
+Recommendation ITU-T T.871 (05/2011): Information technology - Digital
+compression and coding of continuous-tone still images: JPEG File Interchange
+Format (JFIF). It is available as a free download in PDF file format from
+https://www.iso.org/standard/54989.html and http://www.itu.int/rec/T-REC-T.871.
+A PDF file of the older JFIF 1.02 specification is available at
+http://www.w3.org/Graphics/JPEG/jfif3.pdf.
+
+The TIFF 6.0 file format specification can be obtained from
+http://mirrors.ctan.org/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation
+scheme found in the TIFF 6.0 spec of 3-June-92 has a number of serious
+problems. IJG does not recommend use of the TIFF 6.0 design (TIFF Compression
+tag 6). Instead, we recommend the JPEG design proposed by TIFF Technical Note
+#2 (Compression tag 7). Copies of this Note can be obtained from
http://www.ijg.org/files/. It is expected that the next revision
of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
Although IJG's own code does not support TIFF/JPEG, the free libtiff library
@@ -249,28 +230,26 @@ The most recent released version can always be found there in
directory "files".
The JPEG FAQ (Frequently Asked Questions) article is a source of some
-general information about JPEG.
-It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
-and other news.answers archive sites, including the official news.answers
-archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
-If you don't have Web or FTP access, send e-mail to mail-server@rtfm.mit.edu
-with body
- send usenet/news.answers/jpeg-faq/part1
- send usenet/news.answers/jpeg-faq/part2
-
-
-FILE FORMAT WARS
-================
-
-The ISO/IEC JTC1/SC29/WG1 standards committee (also known as JPEG, together
-with ITU-T SG16) currently promotes different formats containing the name
-"JPEG" which are incompatible with original DCT-based JPEG. IJG therefore does
-not support these formats (see REFERENCES). Indeed, one of the original
-reasons for developing this free software was to help force convergence on
-common, interoperable format standards for JPEG files.
-Don't use an incompatible file format!
-(In any case, our decoder will remain capable of reading existing JPEG
-image files indefinitely.)
+general information about JPEG. It is available at
+http://www.faqs.org/faqs/jpeg-faq.
+
+
+FILE FORMAT COMPATIBILITY
+=========================
+
+This software implements ITU T.81 | ISO/IEC 10918 with some extensions from
+ITU T.871 | ISO/IEC 10918-5 (JPEG File Interchange Format -- see REFERENCES).
+Informally, the term "JPEG image" or "JPEG file" most often refers to JFIF or
+a subset thereof, but there are other formats containing the name "JPEG" that
+are incompatible with the DCT-based JPEG standard or with JFIF (for instance,
+JPEG 2000 and JPEG XR). This software therefore does not support these
+formats. Indeed, one of the original reasons for developing this free software
+was to help force convergence on a common, interoperable format standard for
+JPEG files.
+
+JFIF is a minimal or "low end" representation. TIFF/JPEG (TIFF revision 6.0 as
+modified by TIFF Technical Note #2) can be used for "high end" applications
+that need to record a lot of additional data about an image.
TO DO
diff --git a/media/libjpeg/README.md b/media/libjpeg/README.md
index ca8866e06c..01e391ea7c 100755..100644
--- a/media/libjpeg/README.md
+++ b/media/libjpeg/README.md
@@ -1,13 +1,14 @@
Background
==========
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
-x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal. On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines. In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
+systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal. On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines. In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
@@ -42,21 +43,25 @@ Using libjpeg-turbo
libjpeg-turbo includes two APIs that can be used to compress and decompress
JPEG images:
-- **TurboJPEG API**
+- **TurboJPEG API**<br>
This API provides an easy-to-use interface for compressing and decompressing
JPEG images in memory. It also provides some functionality that would not be
straightforward to achieve using the underlying libjpeg API, such as
generating planar YUV images and performing multiple simultaneous lossless
transforms on an image. The Java interface for libjpeg-turbo is written on
- top of the TurboJPEG API.
+ top of the TurboJPEG API. The TurboJPEG API is recommended for first-time
+ users of libjpeg-turbo. Refer to [tjexample.c](tjexample.c) and
+ [TJExample.java](java/TJExample.java) for examples of its usage and to
+ <http://libjpeg-turbo.org/Documentation/Documentation> for API documentation.
-- **libjpeg API**
+- **libjpeg API**<br>
This is the de facto industry-standard API for compressing and decompressing
JPEG images. It is more difficult to use than the TurboJPEG API but also
more powerful. The libjpeg API implementation in libjpeg-turbo is both
API/ABI-compatible and mathematically compatible with libjpeg v6b. It can
also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8
- (see below.)
+ (see below.) Refer to [cjpeg.c](cjpeg.c) and [djpeg.c](djpeg.c) for examples
+ of its usage and to [libjpeg.txt](libjpeg.txt) for API documentation.
There is no significant performance advantage to either API when both are used
to perform similar operations.
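As an illustrative sketch of the TurboJPEG API described above (editorial example, not part of this patch; it uses the documented tjInitCompress()/tjCompress2() interface), compressing an RGB buffer to an in-memory JPEG image looks roughly like this:

```c
/* Sketch: compress an RGB buffer with the TurboJPEG API.
 * Error handling is abbreviated for clarity. */
#include <stdio.h>
#include <turbojpeg.h>

int compress_rgb(const unsigned char *rgb, int width, int height,
                 unsigned char **jpegBuf, unsigned long *jpegSize)
{
  tjhandle handle = tjInitCompress();

  if (handle == NULL)
    return -1;
  /* Passing *jpegBuf == NULL asks TurboJPEG to allocate the output buffer;
   * the caller must release it with tjFree(). */
  *jpegBuf = NULL;
  if (tjCompress2(handle, rgb, width, 0 /* pitch: 0 = width * 3 for RGB */,
                  height, TJPF_RGB, jpegBuf, jpegSize, TJSAMP_420,
                  90 /* quality */, 0 /* flags */) < 0) {
    fprintf(stderr, "tjCompress2(): %s\n", tjGetErrorStr());
    tjDestroy(handle);
    return -1;
  }
  tjDestroy(handle);
  return 0;
}
```

Decompression is symmetrical (tjInitDecompress(), then tjDecompressHeader3() and tjDecompress2()).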
@@ -130,28 +135,27 @@ without recompiling. libjpeg-turbo does not claim to support all of the
libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
cases (see below.)
-By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or
-an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a
-version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that
-programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.
-The following section describes which libjpeg v7+ features are supported and
-which aren't.
+By passing an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you
+can build a version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so
+that programs that are built against libjpeg v7 or v8 can be run with
+libjpeg-turbo. The following section describes which libjpeg v7+ features are
+supported and which aren't.
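For illustration (an editorial sketch, not part of this patch): a program can verify at compile time which ABI its libjpeg headers declare, because jconfig.h/jpeglib.h define `JPEG_LIB_VERSION` as 62, 70, or 80 for v6b, v7, and v8 emulation respectively:

```c
#include <stdio.h>
#include <jpeglib.h>

/* Report which libjpeg API/ABI generation this binary was compiled against. */
void print_jpeg_abi(void)
{
#if JPEG_LIB_VERSION >= 80
  printf("libjpeg v8 API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#elif JPEG_LIB_VERSION >= 70
  printf("libjpeg v7 API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#else
  printf("libjpeg v6b API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#endif
}
```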
### Support for libjpeg v7 and v8 Features
#### Fully supported
-- **libjpeg: IDCT scaling extensions in decompressor**
+- **libjpeg API: IDCT scaling extensions in decompressor**<br>
libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
and 1/2 are SIMD-accelerated.)
-- **libjpeg: Arithmetic coding**
+- **libjpeg API: Arithmetic coding**
-- **libjpeg: In-memory source and destination managers**
+- **libjpeg API: In-memory source and destination managers**<br>
See notes below.
-- **cjpeg: Separate quality settings for luminance and chrominance**
+- **cjpeg: Separate quality settings for luminance and chrominance**<br>
Note that the libjpeg v7+ API was extended to accommodate this feature only
for convenience purposes. It has always been possible to implement this
feature with libjpeg v6b (see rdswitch.c for an example.)
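The IDCT scaling extension in the "Fully supported" list above can be exercised as follows (an editorial sketch, not part of this patch; `scale_num` and `scale_denom` are standard fields of `jpeg_decompress_struct`):

```c
#include <stdio.h>
#include <jpeglib.h>

/* Request 1/2-scale output.  Must be called after jpeg_read_header() and
 * before jpeg_start_decompress(), which then computes the scaled
 * output_width/output_height. */
void request_half_scale(struct jpeg_decompress_struct *dinfo)
{
  dinfo->scale_num = 1;
  dinfo->scale_denom = 2;   /* one of the supported scaling factors */
  jpeg_start_decompress(dinfo);
}
```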
@@ -175,19 +179,19 @@ which aren't.
NOTE: As of this writing, extensive research has been conducted into the
usefulness of DCT scaling as a means of data reduction and SmartScale as a
-means of quality improvement. The reader is invited to peruse the research at
-<http://www.libjpeg-turbo.org/About/SmartScale> and draw his/her own conclusions,
+means of quality improvement. Readers are invited to peruse the research at
+<http://www.libjpeg-turbo.org/About/SmartScale> and draw their own conclusions,
but it is the general belief of our project that these features have not
demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
-- **libjpeg: DCT scaling in compressor**
+- **libjpeg API: DCT scaling in compressor**<br>
`cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
There is no technical reason why DCT scaling could not be supported when
emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
8/9 would be available, which is of limited usefulness.
-- **libjpeg: SmartScale**
+- **libjpeg API: SmartScale**<br>
`cinfo.block_size` is silently ignored.
SmartScale is an extension to the JPEG format that allows for DCT block
sizes other than 8x8. Providing support for this new format would be
@@ -200,15 +204,15 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
interest in providing this feature would be as a means of supporting
additional DCT scaling factors.
-- **libjpeg: Fancy downsampling in compressor**
+- **libjpeg API: Fancy downsampling in compressor**<br>
`cinfo.do_fancy_downsampling` is silently ignored.
This requires the DCT scaling feature, which is not supported.
-- **jpegtran: Scaling**
+- **jpegtran: Scaling**<br>
This requires both the DCT scaling and SmartScale features, which are not
supported.
-- **Lossless RGB JPEG files**
+- **Lossless RGB JPEG files**<br>
This requires the SmartScale feature, which is not supported.
### What About libjpeg v9?
@@ -226,7 +230,7 @@ generally accomplish anything that can't already be accomplished better with
existing, standard lossless formats. Therefore, at this time it is our belief
that there is not sufficient technical justification for software projects to
upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient
-echnical justification for us to emulate the libjpeg v9 ABI.
+technical justification for us to emulate the libjpeg v9 ABI.
In-Memory Source/Destination Managers
-------------------------------------
@@ -242,15 +246,14 @@ don't, and it allows those functions to be provided in the "official"
libjpeg-turbo binaries.
Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or
-an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building
-libjpeg-turbo. This will restore the pre-1.3 behavior, in which
+v6b or v7 API can pass an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to
+building libjpeg-turbo. This will restore the pre-1.3 behavior, in which
`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
libjpeg v8 API/ABI.
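A minimal sketch of the in-memory destination manager discussed here (editorial example, not part of this patch), using the documented `jpeg_mem_dest()` signature:

```c
/* Sketch: compress to a malloc()'d buffer with the in-memory
 * destination manager.  Error handling is abbreviated. */
#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

void compress_to_memory(JSAMPARRAY rows, int width, int height)
{
  struct jpeg_compress_struct cinfo;
  struct jpeg_error_mgr jerr;
  unsigned char *outbuffer = NULL;  /* jpeg_mem_dest() allocates/grows it */
  unsigned long outsize = 0;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_compress(&cinfo);
  jpeg_mem_dest(&cinfo, &outbuffer, &outsize);

  cinfo.image_width = width;
  cinfo.image_height = height;
  cinfo.input_components = 3;
  cinfo.in_color_space = JCS_RGB;
  jpeg_set_defaults(&cinfo);

  jpeg_start_compress(&cinfo, TRUE);
  while (cinfo.next_scanline < cinfo.image_height)
    jpeg_write_scanlines(&cinfo, &rows[cinfo.next_scanline], 1);
  jpeg_finish_compress(&cinfo);

  /* outbuffer now holds outsize bytes of JPEG data. */
  jpeg_destroy_compress(&cinfo);
  free(outbuffer);
}
```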
On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
-emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
+the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
+emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
Note that, on most Un*x systems, the dynamic linker will not look for a
function in a library until that function is actually used. Thus, if a program
@@ -284,12 +287,13 @@ following reasons:
(and slightly faster) floating point IDCT algorithm introduced in libjpeg
v8a as opposed to the algorithm used in libjpeg v6b. It should be noted,
however, that this algorithm basically brings the accuracy of the floating
- point IDCT in line with the accuracy of the slow integer IDCT. The floating
- point DCT/IDCT algorithms are mainly a legacy feature, and they do not
- produce significantly more accuracy than the slow integer algorithms (to put
- numbers on this, the typical difference in PNSR between the two algorithms
- is less than 0.10 dB, whereas changing the quality level by 1 in the upper
- range of the quality scale is typically more like a 1.0 dB difference.)
+ point IDCT in line with the accuracy of the accurate integer IDCT. The
+ floating point DCT/IDCT algorithms are mainly a legacy feature, and they do
+ not produce significantly more accuracy than the accurate integer algorithms
+ (to put numbers on this, the typical difference in PSNR between the two
+ algorithms is less than 0.10 dB, whereas changing the quality level by 1 in
+ the upper range of the quality scale is typically more like a 1.0 dB
+ difference.)
- If the floating point algorithms in libjpeg-turbo are not implemented using
SIMD instructions on a particular platform, then the accuracy of the
@@ -326,7 +330,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
necessary to use the slow Huffman decoder when decompressing a JPEG image that
has restart markers. This can cause the decompression performance to drop by
as much as 20%, but the performance will still be much greater than that of
-libjpeg. Many consumer packages, such as PhotoShop, use restart markers when
+libjpeg. Many consumer packages, such as Photoshop, use restart markers when
generating JPEG images, so images generated by those programs will experience
this issue.
@@ -337,5 +341,17 @@ The algorithm used by the SIMD-accelerated quantization function cannot produce
correct results whenever the fast integer forward DCT is used along with a JPEG
quality of 98-100. Thus, libjpeg-turbo must use the non-SIMD quantization
function in those cases. This causes performance to drop by as much as 40%.
-It is therefore strongly advised that you use the slow integer forward DCT
+It is therefore strongly advised that you use the accurate integer forward DCT
whenever encoding images with a JPEG quality of 98 or higher.
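A small sketch of that advice (editorial example, not part of this patch). `JDCT_ISLOW` selects the accurate integer DCT and is the library default, so this only matters if the application has previously opted into `JDCT_IFAST`:

```c
#include <stdio.h>
#include <jpeglib.h>

/* Configure an already-created compress object for quality >= 98.
 * Assumes cinfo->in_color_space was set before this call, as
 * jpeg_set_defaults() requires. */
void set_high_quality(j_compress_ptr cinfo, int quality)
{
  jpeg_set_defaults(cinfo);
  cinfo->dct_method = JDCT_ISLOW;          /* accurate integer DCT */
  jpeg_set_quality(cinfo, quality, TRUE);  /* e.g. quality = 98 */
}
```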
+
+
+Memory Debugger Pitfalls
+========================
+
+Valgrind and Memory Sanitizer (MSan) can generate false positives
+(specifically, incorrect reports of uninitialized memory accesses) when used
+with libjpeg-turbo's SIMD extensions. It is generally recommended that the
+SIMD extensions be disabled, either by passing an argument of `-DWITH_SIMD=0`
+to `cmake` when configuring the build or by setting the environment variable
+`JSIMD_FORCENONE` to `1` at run time, when testing libjpeg-turbo with Valgrind,
+MSan, or other memory debuggers.
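As a run-time alternative to rebuilding with `-DWITH_SIMD=0` (an editorial sketch, not part of this patch, assuming a POSIX `setenv()`), a test harness can set the variable from C before the first libjpeg-turbo call:

```c
#include <stdlib.h>

/* Disable the SIMD code paths for a Valgrind/MSan run.  This must happen
 * before the first compress/decompress object is created, because the SIMD
 * support flags are detected once and then cached. */
void disable_simd_for_memory_debugging(void)
{
  setenv("JSIMD_FORCENONE", "1", 1 /* overwrite */);
}
```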
diff --git a/media/libjpeg/jaricom.c b/media/libjpeg/jaricom.c
index 3bb557f7a3..215640cc44 100644
--- a/media/libjpeg/jaricom.c
+++ b/media/libjpeg/jaricom.c
@@ -4,16 +4,16 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains probability estimation tables for common use in
* arithmetic entropy encoding and decoding routines.
*
- * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
- * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
- * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ * This data represents Table D.2 in
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 and Table 24 in
+ * Recommendation ITU-T T.82 (1993) | ISO/IEC 11544:1993.
*/
#define JPEG_INTERNALS
@@ -29,9 +29,10 @@
* implementation (jbig_tab.c).
*/
-#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
+#define V(i, a, b, c, d) \
+ (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
-const JLONG jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113 + 1] = {
/*
* Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
*/
diff --git a/media/libjpeg/jcapimin.c b/media/libjpeg/jcapimin.c
index 15674be54a..84e7ecc9a7 100644
--- a/media/libjpeg/jcapimin.c
+++ b/media/libjpeg/jcapimin.c
@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -31,7 +31,7 @@
*/
GLOBAL(void)
-jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
{
int i;
@@ -41,7 +41,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
if (structsize != sizeof(struct jpeg_compress_struct))
ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
- (int) sizeof(struct jpeg_compress_struct), (int) structsize);
+ (int)sizeof(struct jpeg_compress_struct), (int)structsize);
/* For debugging purposes, we zero the whole master structure.
* But the application has already set the err pointer, and may have set
@@ -52,14 +52,14 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
- MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
+ memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
cinfo->is_decompressor = FALSE;
/* Initialize a memory manager instance for this object */
- jinit_memory_mgr((j_common_ptr) cinfo);
+ jinit_memory_mgr((j_common_ptr)cinfo);
/* Zero out pointers to permanent structures. */
cinfo->progress = NULL;
@@ -83,7 +83,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
/* Must do it here for emit_dqt in case jpeg_write_tables is used */
cinfo->block_size = DCTSIZE;
cinfo->natural_order = jpeg_natural_order;
- cinfo->lim_Se = DCTSIZE2-1;
+ cinfo->lim_Se = DCTSIZE2 - 1;
#endif
cinfo->script_space = NULL;
@@ -100,9 +100,9 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
*/
GLOBAL(void)
-jpeg_destroy_compress (j_compress_ptr cinfo)
+jpeg_destroy_compress(j_compress_ptr cinfo)
{
- jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+ jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
}
@@ -112,9 +112,9 @@ jpeg_destroy_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_abort_compress (j_compress_ptr cinfo)
+jpeg_abort_compress(j_compress_ptr cinfo)
{
- jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+ jpeg_abort((j_common_ptr)cinfo); /* use common routine */
}
@@ -131,7 +131,7 @@ jpeg_abort_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
+jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress)
{
int i;
JQUANT_TBL *qtbl;
@@ -159,7 +159,7 @@ jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
*/
GLOBAL(void)
-jpeg_finish_compress (j_compress_ptr cinfo)
+jpeg_finish_compress(j_compress_ptr cinfo)
{
JDIMENSION iMCU_row;
@@ -172,18 +172,18 @@ jpeg_finish_compress (j_compress_ptr cinfo)
} else if (cinfo->global_state != CSTATE_WRCOEFS)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Perform any remaining passes */
- while (! cinfo->master->is_last_pass) {
+ while (!cinfo->master->is_last_pass) {
(*cinfo->master->prepare_for_pass) (cinfo);
for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) {
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) iMCU_row;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)iMCU_row;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* We bypass the main controller and invoke coef controller directly;
* all work is being done from the coefficient buffer.
*/
- if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL))
+ if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
(*cinfo->master->finish_pass) (cinfo);
@@ -192,7 +192,7 @@ jpeg_finish_compress (j_compress_ptr cinfo)
(*cinfo->marker->write_file_trailer) (cinfo);
(*cinfo->dest->term_destination) (cinfo);
/* We can use jpeg_abort to release memory and reset global_state */
- jpeg_abort((j_common_ptr) cinfo);
+ jpeg_abort((j_common_ptr)cinfo);
}
@@ -204,8 +204,8 @@ jpeg_finish_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_write_marker (j_compress_ptr cinfo, int marker,
- const JOCTET *dataptr, unsigned int datalen)
+jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+ unsigned int datalen)
{
void (*write_marker_byte) (j_compress_ptr info, int val);
@@ -226,7 +226,7 @@ jpeg_write_marker (j_compress_ptr cinfo, int marker,
/* Same, but piecemeal. */
GLOBAL(void)
-jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+jpeg_write_m_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
{
if (cinfo->next_scanline != 0 ||
(cinfo->global_state != CSTATE_SCANNING &&
@@ -238,7 +238,7 @@ jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
}
GLOBAL(void)
-jpeg_write_m_byte (j_compress_ptr cinfo, int val)
+jpeg_write_m_byte(j_compress_ptr cinfo, int val)
{
(*cinfo->marker->write_marker_byte) (cinfo, val);
}
@@ -266,13 +266,13 @@ jpeg_write_m_byte (j_compress_ptr cinfo, int val)
*/
GLOBAL(void)
-jpeg_write_tables (j_compress_ptr cinfo)
+jpeg_write_tables(j_compress_ptr cinfo)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Initialize the marker writer ... bit of a crock to do it here. */
jinit_marker_writer(cinfo);
diff --git a/media/libjpeg/jcapistd.c b/media/libjpeg/jcapistd.c
index 5c6d0be255..aa2aad9f66 100644
--- a/media/libjpeg/jcapistd.c
+++ b/media/libjpeg/jcapistd.c
@@ -36,7 +36,7 @@
*/
GLOBAL(void)
-jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
+jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -45,7 +45,7 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Perform master selection of active modules */
jinit_compress_master(cinfo);
@@ -75,8 +75,8 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
*/
GLOBAL(JDIMENSION)
-jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
- JDIMENSION num_lines)
+jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+ JDIMENSION num_lines)
{
JDIMENSION row_ctr, rows_left;
@@ -87,9 +87,9 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->next_scanline;
- cinfo->progress->pass_limit = (long) cinfo->image_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->image_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Give master control module another chance if this is first call to
@@ -118,8 +118,8 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
*/
GLOBAL(JDIMENSION)
-jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION num_lines)
+jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION num_lines)
{
JDIMENSION lines_per_iMCU_row;
@@ -132,9 +132,9 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->next_scanline;
- cinfo->progress->pass_limit = (long) cinfo->image_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->image_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Give master control module another chance if this is first call to
@@ -151,7 +151,7 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
/* Directly compress the row. */
- if (! (*cinfo->coef->compress_data) (cinfo, data)) {
+ if (!(*cinfo->coef->compress_data) (cinfo, data)) {
/* If compressor did not consume the whole row, suspend processing. */
return 0;
}
diff --git a/media/libjpeg/jcarith.c b/media/libjpeg/jcarith.c
index 6d3b8af5b4..b1720521bf 100644
--- a/media/libjpeg/jcarith.c
+++ b/media/libjpeg/jcarith.c
@@ -4,16 +4,19 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains portable arithmetic entropy encoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
*
* Both sequential and progressive modes are supported in this single module.
*
* Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -63,8 +66,8 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
* in the lower bits (mask 0x7F).
*/
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
/* NOTE: Uncomment the following #define if you want to use the
* given formula for calculating the AC conditioning parameter Kx
@@ -105,25 +108,25 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS int ishift_temp;
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
LOCAL(void)
-emit_byte (int val, j_compress_ptr cinfo)
+emit_byte(int val, j_compress_ptr cinfo)
/* Write next output byte; we do not support suspension in this module. */
{
struct jpeg_destination_mgr *dest = cinfo->dest;
- *dest->next_output_byte++ = (JOCTET) val;
+ *dest->next_output_byte++ = (JOCTET)val;
if (--dest->free_in_buffer == 0)
- if (! (*dest->empty_output_buffer) (cinfo))
+ if (!(*dest->empty_output_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
@@ -133,22 +136,22 @@ emit_byte (int val, j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass (j_compress_ptr cinfo)
+finish_pass(j_compress_ptr cinfo)
{
- arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
JLONG temp;
/* Section D.1.8: Termination of encoding */
/* Find the e->c in the coding interval with the largest
* number of trailing zero bits */
- if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+ if ((temp = (e->a - 1 + e->c) & 0xFFFF0000UL) < e->c)
e->c = temp + 0x8000L;
else
e->c = temp;
/* Send remaining bytes to output */
e->c <<= e->ct;
- if (e->c & 0xF8000000L) {
+ if (e->c & 0xF8000000UL) {
/* One final overflow has to be handled */
if (e->buffer >= 0) {
if (e->zc)
@@ -219,9 +222,9 @@ finish_pass (j_compress_ptr cinfo)
*/
LOCAL(void)
-arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
+arith_encode(j_compress_ptr cinfo, unsigned char *st, int val)
{
- register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
register unsigned char nl, nm;
register JLONG qe, temp;
register int sv;
@@ -231,8 +234,8 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
*/
sv = *st;
qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
- nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
- nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
/* Encode & estimation procedures per sections D.1.4 & D.1.5 */
e->a -= qe;
@@ -319,9 +322,9 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
*/
LOCAL(void)
-emit_restart (j_compress_ptr cinfo, int restart_num)
+emit_restart(j_compress_ptr cinfo, int restart_num)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci;
jpeg_component_info *compptr;
@@ -335,14 +338,14 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
compptr = cinfo->cur_comp_info[ci];
/* DC needs no table for refinement scan */
if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
- MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+ memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
/* AC needs no table when not present */
if (cinfo->progressive_mode == 0 || cinfo->Se) {
- MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+ memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@@ -362,9 +365,9 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int blkn, ci, tbl;
@@ -391,7 +394,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Compute the DC value after the required point transform by Al.
* This is simply an arithmetic right shift.
*/
- m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+ m = IRIGHT_SHIFT((int)((*block)[0]), cinfo->Al);
/* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
@@ -432,9 +435,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
arith_encode(cinfo, st, 0);
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] += 8; /* large diff category */
/* Figure F.9: Encoding the magnitude bit pattern of v */
st += 14;
@@ -453,9 +456,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, k, ke;
@@ -510,7 +513,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
break;
}
}
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
st += 2;
/* Figure F.8: Encoding the magnitude category of v */
@@ -552,9 +555,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
unsigned char *st;
int Al, blkn;
@@ -587,9 +590,9 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, k, ke, kex;
@@ -662,7 +665,7 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
break;
}
}
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
}
/* Encode EOB decision only if k <= cinfo->Se */
@@ -680,9 +683,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
jpeg_component_info *compptr;
JBLOCKROW block;
unsigned char *st;
@@ -747,9 +750,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
arith_encode(cinfo, st, 0);
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] += 8; /* large diff category */
/* Figure F.9: Encoding the magnitude bit pattern of v */
st += 14;
@@ -770,7 +773,7 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
arith_encode(cinfo, st, 0); /* EOB decision */
while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
arith_encode(cinfo, st + 1, 1);
/* Figure F.6: Encoding nonzero value v */
@@ -822,9 +825,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass(j_compress_ptr cinfo, boolean gather_statistics)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci, tbl;
jpeg_component_info *compptr;
@@ -833,7 +836,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
* We are fully adaptive here and need no extra
* statistics gathering pass!
*/
- ERREXIT(cinfo, JERR_NOT_COMPILED);
+ ERREXIT(cinfo, JERR_NOTIMPL);
/* We assume jcmaster.c already validated the progressive scan parameters. */
@@ -862,9 +865,9 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->dc_stats[tbl] == NULL)
- entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
- MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+ entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+ memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@@ -875,13 +878,14 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->ac_stats[tbl] == NULL)
- entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
- MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+ entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+ memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
#ifdef CALCULATE_SPECTRAL_CONDITIONING
if (cinfo->progressive_mode)
/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
- cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+ cinfo->arith_ac_K[tbl] = cinfo->Ss +
+ ((8 + cinfo->Se - cinfo->Ss) >> 4);
#endif
}
}
@@ -905,15 +909,15 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
*/
GLOBAL(void)
-jinit_arith_encoder (j_compress_ptr cinfo)
+jinit_arith_encoder(j_compress_ptr cinfo)
{
arith_entropy_ptr entropy;
int i;
entropy = (arith_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(arith_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass;
entropy->pub.finish_pass = finish_pass;
diff --git a/media/libjpeg/jccoefct.c b/media/libjpeg/jccoefct.c
index a08d6e3230..068232a527 100644
--- a/media/libjpeg/jccoefct.c
+++ b/media/libjpeg/jccoefct.c
@@ -58,21 +58,19 @@ typedef my_coef_controller *my_coef_ptr;
/* Forward declarations */
-METHODDEF(boolean) compress_data
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
#ifdef FULL_COEF_BUFFER_SUPPORTED
-METHODDEF(boolean) compress_first_pass
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
-METHODDEF(boolean) compress_output
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+ JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
#endif
LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -81,7 +79,7 @@ start_iMCU_row (j_compress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+ if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -97,9 +95,9 @@ start_iMCU_row (j_compress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
coef->iMCU_row_num = 0;
start_iMCU_row(cinfo);
@@ -140,9 +138,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(boolean)
-compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -167,31 +165,33 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
blkn = 0;
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
- blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width;
xpos = MCU_col_num * compptr->MCU_sample_width;
ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (coef->iMCU_row_num < last_iMCU_row ||
- yoffset+yindex < compptr->last_row_height) {
+ yoffset + yindex < compptr->last_row_height) {
(*cinfo->fdct->forward_DCT) (cinfo, compptr,
input_buf[compptr->component_index],
coef->MCU_buffer[blkn],
- ypos, xpos, (JDIMENSION) blockcnt);
+ ypos, xpos, (JDIMENSION)blockcnt);
if (blockcnt < compptr->MCU_width) {
/* Create some dummy blocks at the right edge of the image. */
- jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
+ jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
(compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
- coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn + bi - 1][0][0];
}
}
} else {
/* Create a row of dummy blocks at the bottom of the image. */
- jzero_far((void *) coef->MCU_buffer[blkn],
+ jzero_far((void *)coef->MCU_buffer[blkn],
compptr->MCU_width * sizeof(JBLOCK));
for (bi = 0; bi < compptr->MCU_width; bi++) {
- coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn - 1][0][0];
}
}
blkn += compptr->MCU_width;
@@ -201,7 +201,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
/* Try to write the MCU. In event of a suspension failure, we will
* re-DCT the MCU on restart (a bit inefficient, could be fixed...)
*/
- if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -242,9 +242,9 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
METHODDEF(boolean)
-compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION blocks_across, MCUs_across, MCUindex;
int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
@@ -257,21 +257,21 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
ci++, compptr++) {
/* Align the virtual buffer for this component. */
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, TRUE);
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
/* Count non-dummy DCT block rows in this iMCU row. */
if (coef->iMCU_row_num < last_iMCU_row)
block_rows = compptr->v_samp_factor;
else {
/* NB: can't use last_row_height here, since may not be set! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
}
blocks_across = compptr->width_in_blocks;
h_samp_factor = compptr->h_samp_factor;
/* Count number of dummy blocks to be added at the right margin. */
- ndummy = (int) (blocks_across % h_samp_factor);
+ ndummy = (int)(blocks_across % h_samp_factor);
if (ndummy > 0)
ndummy = h_samp_factor - ndummy;
/* Perform DCT for all non-dummy blocks in this iMCU row. Each call
@@ -281,12 +281,12 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
thisblockrow = buffer[block_row];
(*cinfo->fdct->forward_DCT) (cinfo, compptr,
input_buf[ci], thisblockrow,
- (JDIMENSION) (block_row * DCTSIZE),
- (JDIMENSION) 0, blocks_across);
+ (JDIMENSION)(block_row * DCTSIZE),
+ (JDIMENSION)0, blocks_across);
if (ndummy > 0) {
/* Create dummy blocks at the right edge of the image. */
thisblockrow += blocks_across; /* => first dummy block */
- jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+ jzero_far((void *)thisblockrow, ndummy * sizeof(JBLOCK));
lastDC = thisblockrow[-1][0];
for (bi = 0; bi < ndummy; bi++) {
thisblockrow[bi][0] = lastDC;
@@ -304,11 +304,11 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (block_row = block_rows; block_row < compptr->v_samp_factor;
block_row++) {
thisblockrow = buffer[block_row];
- lastblockrow = buffer[block_row-1];
- jzero_far((void *) thisblockrow,
- (size_t) (blocks_across * sizeof(JBLOCK)));
+ lastblockrow = buffer[block_row - 1];
+ jzero_far((void *)thisblockrow,
+ (size_t)(blocks_across * sizeof(JBLOCK)));
for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
- lastDC = lastblockrow[h_samp_factor-1][0];
+ lastDC = lastblockrow[h_samp_factor - 1][0];
for (bi = 0; bi < h_samp_factor; bi++) {
thisblockrow[bi][0] = lastDC;
}
@@ -338,9 +338,9 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
int blkn, ci, xindex, yindex, yoffset;
JDIMENSION start_col;
@@ -355,9 +355,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
}
/* Loop to process one whole iMCU row */
@@ -371,14 +371,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
coef->MCU_buffer[blkn++] = buffer_ptr++;
}
}
}
/* Try to write the MCU. */
- if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -402,14 +402,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
GLOBAL(void)
-jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_coef_ptr coef;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_c_coef_controller *)coef;
coef->pub.start_pass = start_pass_coef;
/* Create the coefficient buffer. */
@@ -423,12 +423,12 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
- (JDIMENSION) jround_up((long) compptr->width_in_blocks,
- (long) compptr->h_samp_factor),
- (JDIMENSION) jround_up((long) compptr->height_in_blocks,
- (long) compptr->v_samp_factor),
- (JDIMENSION) compptr->v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+ (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+ (long)compptr->h_samp_factor),
+ (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+ (long)compptr->v_samp_factor),
+ (JDIMENSION)compptr->v_samp_factor);
}
#else
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -439,7 +439,7 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
int i;
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
coef->MCU_buffer[i] = buffer + i;
diff --git a/media/libjpeg/jccolext.c b/media/libjpeg/jccolext.c
index 479b320446..303b322ce6 100644
--- a/media/libjpeg/jccolext.c
+++ b/media/libjpeg/jccolext.c
@@ -29,13 +29,13 @@
INLINE
LOCAL(void)
-rgb_ycc_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
- register JLONG * ctab = cconvert->rgb_ycc_tab;
+ register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
register JSAMPROW outptr0, outptr1, outptr2;
register JDIMENSION col;
@@ -48,9 +48,9 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
outptr2 = output_buf[2][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
+ r = inptr[RGB_RED];
+ g = inptr[RGB_GREEN];
+ b = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
/* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
* must be too; we do not need an explicit range-limiting operation.
@@ -58,17 +58,14 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
* need the general RIGHT_SHIFT macro.
*/
/* Y */
- outptr0[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
/* Cb */
- outptr1[col] = (JSAMPLE)
- ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
- >> SCALEBITS);
+ outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+ ctab[b + B_CB_OFF]) >> SCALEBITS);
/* Cr */
- outptr2[col] = (JSAMPLE)
- ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
- >> SCALEBITS);
+ outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+ ctab[b + B_CR_OFF]) >> SCALEBITS);
}
}
}
@@ -86,13 +83,13 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_gray_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
- register JLONG * ctab = cconvert->rgb_ycc_tab;
+ register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
register JSAMPROW outptr;
register JDIMENSION col;
@@ -103,14 +100,13 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
outptr = output_buf[0][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
+ r = inptr[RGB_RED];
+ g = inptr[RGB_GREEN];
+ b = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
/* Y */
- outptr[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
}
}
}
@@ -123,9 +119,9 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr0, outptr1, outptr2;
@@ -139,9 +135,9 @@ rgb_rgb_convert_internal (j_compress_ptr cinfo,
outptr2 = output_buf[2][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
- outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
- outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+ outptr0[col] = inptr[RGB_RED];
+ outptr1[col] = inptr[RGB_GREEN];
+ outptr2[col] = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
}
}
diff --git a/media/libjpeg/jccolor.c b/media/libjpeg/jccolor.c
index b973d101d6..bdc563c723 100644
--- a/media/libjpeg/jccolor.c
+++ b/media/libjpeg/jccolor.c
@@ -63,9 +63,9 @@ typedef my_color_converter *my_cconvert_ptr;
*/
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define CBCR_OFFSET ((JLONG) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define CBCR_OFFSET ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* We allocate one big table and divide it up into eight parts, instead of
* doing eight alloc_small requests. This lets us use a single table base
@@ -74,15 +74,15 @@ typedef my_color_converter *my_cconvert_ptr;
*/
#define R_Y_OFF 0 /* offset to R => Y section */
-#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */
-#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */
-#define R_CB_OFF (3*(MAXJSAMPLE+1))
-#define G_CB_OFF (4*(MAXJSAMPLE+1))
-#define B_CB_OFF (5*(MAXJSAMPLE+1))
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* etc. */
+#define R_CB_OFF (3 * (MAXJSAMPLE + 1))
+#define G_CB_OFF (4 * (MAXJSAMPLE + 1))
+#define B_CB_OFF (5 * (MAXJSAMPLE + 1))
#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF (6*(MAXJSAMPLE+1))
-#define B_CR_OFF (7*(MAXJSAMPLE+1))
-#define TABLE_SIZE (8*(MAXJSAMPLE+1))
+#define G_CR_OFF (6 * (MAXJSAMPLE + 1))
+#define B_CR_OFF (7 * (MAXJSAMPLE + 1))
+#define TABLE_SIZE (8 * (MAXJSAMPLE + 1))
/* Include inline routines for colorspace extensions */
@@ -93,13 +93,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -109,13 +109,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
-#define rgb_gray_convert_internal extrgbx_gray_convert_internal
-#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
+#define rgb_gray_convert_internal extrgbx_gray_convert_internal
+#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -125,13 +125,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -141,13 +141,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
-#define rgb_gray_convert_internal extbgrx_gray_convert_internal
-#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
+#define rgb_gray_convert_internal extbgrx_gray_convert_internal
+#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -157,13 +157,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extxbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extxbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -173,13 +173,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extxrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extxrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
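
The six define/include/undef stanzas above are C's substitute for templates: jccolext.c is compiled once per pixel layout, with the RGB_* offsets and the *_internal names rebound each time, so each extended colorspace gets a fully specialized converter that is selected later by a switch in rgb_ycc_convert() and friends. A self-contained sketch of the same specialization idea (using a macro instead of a second file, purely for illustration):

#include <stdio.h>

/* One body, stamped out once per byte order -- the effect jccolor.c gets by
 * re-#including jccolext.c with different RGB_RED/RGB_GREEN/RGB_BLUE values.
 */
#define DEFINE_PACK(FN_NAME, PIX_R, PIX_G, PIX_B) \
  static int FN_NAME(const unsigned char *p)      \
  { return 100 * p[PIX_R] + 10 * p[PIX_G] + p[PIX_B]; }

DEFINE_PACK(pack_rgb, 0, 1, 2)   /* RGB byte order */
DEFINE_PACK(pack_bgr, 2, 1, 0)   /* BGR byte order */

int main(void)
{
  const unsigned char px[3] = { 1, 2, 3 };
  printf("%d %d\n", pack_rgb(px), pack_bgr(px));   /* 123 321 */
  return 0;
}
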
@@ -195,33 +195,33 @@ typedef my_color_converter *my_cconvert_ptr;
*/
METHODDEF(void)
-rgb_ycc_start (j_compress_ptr cinfo)
+rgb_ycc_start(j_compress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
JLONG *rgb_ycc_tab;
JLONG i;
/* Allocate and fill in the conversion tables. */
cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(TABLE_SIZE * sizeof(JLONG)));
for (i = 0; i <= MAXJSAMPLE; i++) {
- rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
- rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i;
- rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
- rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i;
- rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i;
+ rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+ rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+ rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+ rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
+ rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
/* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
* This ensures that the maximum output will round to MAXJSAMPLE
* not MAXJSAMPLE+1, and thus that we don't have to range-limit.
*/
- rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1;
+ rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
/* B=>Cb and R=>Cr tables are the same
- rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1;
+ rgb_ycc_tab[i + R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
*/
- rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i;
- rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i;
+ rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
+ rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
}
}
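
The ONE_HALF - 1 fudge factor above is worth a worked number. Without the -1, a pure-blue pixel would push Cb to exactly 256 after the shift and force a range-limiting step; with it, the worst case lands on 255. A standalone check (8-bit samples assumed):

#include <stdio.h>

#define SCALEBITS     16
#define ONE_HALF      ((long)1 << (SCALEBITS - 1))
#define FIX(x)        ((long)((x) * (1L << SCALEBITS) + 0.5))
#define CENTERJSAMPLE 128

int main(void)
{
  long cbcr_offset = (long)CENTERJSAMPLE << SCALEBITS;
  /* worst case for Cb: r = g = 0, b = 255 */
  long cb       = (FIX(0.5) * 255 + cbcr_offset + ONE_HALF - 1) >> SCALEBITS;
  long cb_naive = (FIX(0.5) * 255 + cbcr_offset + ONE_HALF) >> SCALEBITS;
  printf("%ld %ld\n", cb, cb_naive);   /* 255 256 */
  return 0;
}
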
@@ -231,43 +231,42 @@ rgb_ycc_start (j_compress_ptr cinfo)
*/
METHODDEF(void)
-rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -280,43 +279,42 @@ rgb_ycc_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -326,43 +324,42 @@ rgb_gray_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-rgb_rgb_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -376,11 +373,10 @@ rgb_rgb_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-cmyk_ycck_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
@@ -396,11 +392,11 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
outptr3 = output_buf[3][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
- g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
- b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+ r = MAXJSAMPLE - inptr[0];
+ g = MAXJSAMPLE - inptr[1];
+ b = MAXJSAMPLE - inptr[2];
/* K passes through as-is */
- outptr3[col] = inptr[3]; /* don't need GETJSAMPLE here */
+ outptr3[col] = inptr[3];
inptr += 4;
/* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
* must be too; we do not need an explicit range-limiting operation.
@@ -408,17 +404,14 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
* need the general RIGHT_SHIFT macro.
*/
/* Y */
- outptr0[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
/* Cb */
- outptr1[col] = (JSAMPLE)
- ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
- >> SCALEBITS);
+ outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+ ctab[b + B_CB_OFF]) >> SCALEBITS);
/* Cr */
- outptr2[col] = (JSAMPLE)
- ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
- >> SCALEBITS);
+ outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+ ctab[b + B_CR_OFF]) >> SCALEBITS);
}
}
}
@@ -431,9 +424,8 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-grayscale_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr;
@@ -446,7 +438,7 @@ grayscale_convert (j_compress_ptr cinfo,
outptr = output_buf[0][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- outptr[col] = inptr[0]; /* don't need GETJSAMPLE() here */
+ outptr[col] = inptr[0];
inptr += instride;
}
}
@@ -460,9 +452,8 @@ grayscale_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
@@ -506,7 +497,7 @@ null_convert (j_compress_ptr cinfo,
inptr = *input_buf;
outptr = output_buf[ci][output_row];
for (col = 0; col < num_cols; col++) {
- outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+ outptr[col] = inptr[ci];
inptr += nc;
}
}
@@ -522,7 +513,7 @@ null_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-null_method (j_compress_ptr cinfo)
+null_method(j_compress_ptr cinfo)
{
/* no work needed */
}
@@ -533,14 +524,14 @@ null_method (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_color_converter (j_compress_ptr cinfo)
+jinit_color_converter(j_compress_ptr cinfo)
{
my_cconvert_ptr cconvert;
cconvert = (my_cconvert_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_color_converter));
- cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
+ cinfo->cconvert = (struct jpeg_color_converter *)cconvert;
/* set start_pass to null method until we find out differently */
cconvert->pub.start_pass = null_method;
diff --git a/media/libjpeg/jcdctmgr.c b/media/libjpeg/jcdctmgr.c
index aef8517f9c..7dae17a6e1 100644
--- a/media/libjpeg/jcdctmgr.c
+++ b/media/libjpeg/jcdctmgr.c
@@ -41,7 +41,7 @@ typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
FAST_FLOAT *divisors,
FAST_FLOAT *workspace);
-METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
typedef struct {
struct jpeg_forward_dct pub; /* public fields */
@@ -80,7 +80,7 @@ typedef my_fdct_controller *my_fdct_ptr;
*/
LOCAL(int)
-flss (UINT16 val)
+flss(UINT16 val)
{
int bit;
@@ -170,7 +170,7 @@ flss (UINT16 val)
*/
LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
+compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
{
UDCTELEM2 fq, fr;
UDCTELEM c;
@@ -182,10 +182,10 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
* identity function. Since only the C quantization algorithm is used in
* these cases, the scale value is irrelevant.
*/
- dtbl[DCTSIZE2 * 0] = (DCTELEM) 1; /* reciprocal */
- dtbl[DCTSIZE2 * 1] = (DCTELEM) 0; /* correction */
- dtbl[DCTSIZE2 * 2] = (DCTELEM) 1; /* scale */
- dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8); /* shift */
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
+ dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
return 0;
}
@@ -195,28 +195,28 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
fq = ((UDCTELEM2)1 << r) / divisor;
fr = ((UDCTELEM2)1 << r) % divisor;
- c = divisor / 2; /* for rounding */
+ c = divisor / 2; /* for rounding */
- if (fr == 0) { /* divisor is power of two */
+ if (fr == 0) { /* divisor is power of two */
/* fq will be one bit too large to fit in DCTELEM, so adjust */
fq >>= 1;
r--;
- } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
+ } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
c++;
- } else { /* fractional part is > 0.5 */
+ } else { /* fractional part is > 0.5 */
fq++;
}
- dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
- dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
#ifdef WITH_SIMD
- dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
#else
dtbl[DCTSIZE2 * 2] = 1;
#endif
- dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+ dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
- if(r <= 16) return 0;
+ if (r <= 16) return 0;
else return 1;
}
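
compute_reciprocal() is the classic replacement of division by a multiply-and-shift with a precomputed fixed-point reciprocal; the comparison on fr decides whether the rounding error is absorbed by the correction term c or by bumping the reciprocal itself. A self-contained illustration of the idea (simplified: r is hard-coded here rather than derived from flss()):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint16_t divisor = 13;
  int r = 16 + 3;                        /* sizeof(DCTELEM)*8 + (flss(13)-1) */
  uint32_t fq = ((uint32_t)1 << r) / divisor;
  uint32_t fr = ((uint32_t)1 << r) % divisor;
  uint32_t c  = divisor / 2;             /* rounding correction */
  if (fr == 0) { fq >>= 1; r--; }        /* divisor is a power of two */
  else if (fr <= divisor / 2U) c++;      /* fractional part < 0.5 */
  else fq++;                             /* fractional part > 0.5 */

  uint16_t x = 1000;
  uint32_t q = (uint32_t)(((uint64_t)(x + c) * fq) >> r);
  printf("%u %u\n", q, (x + divisor / 2) / divisor);   /* both print 77 */
  return 0;
}
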
@@ -233,9 +233,9 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
*/
METHODDEF(void)
-start_pass_fdctmgr (j_compress_ptr cinfo)
+start_pass_fdctmgr(j_compress_ptr cinfo)
{
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
int ci, qtblno, i;
jpeg_component_info *compptr;
JQUANT_TBL *qtbl;
@@ -259,7 +259,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
*/
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(DCTSIZE2 * 4) * sizeof(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
@@ -269,7 +269,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
fdct->quantize == jsimd_quantize)
fdct->quantize = quantize;
#else
- dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+ dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
#endif
}
break;
@@ -283,7 +283,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
* We apply a further scale factor of 8.
*/
-#define CONST_BITS 14
+#define CONST_BITS 14
static const INT16 aanscales[DCTSIZE2] = {
/* precomputed values scaled up by 14 bits */
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
@@ -299,23 +299,23 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(DCTSIZE2 * 4) * sizeof(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
#if BITS_IN_JSAMPLE == 8
if (!compute_reciprocal(
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-3), &dtbl[i]) &&
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3), &dtbl[i]) &&
fdct->quantize == jsimd_quantize)
fdct->quantize = quantize;
#else
- dtbl[i] = (DCTELEM)
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-3);
+ dtbl[i] = (DCTELEM)
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3);
#endif
}
}
@@ -341,7 +341,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->float_divisors[qtblno] == NULL) {
fdct->float_divisors[qtblno] = (FAST_FLOAT *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
DCTSIZE2 * sizeof(FAST_FLOAT));
}
fdtbl = fdct->float_divisors[qtblno];
@@ -349,7 +349,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
for (row = 0; row < DCTSIZE; row++) {
for (col = 0; col < DCTSIZE; col++) {
fdtbl[i] = (FAST_FLOAT)
- (1.0 / (((double) qtbl->quantval[i] *
+ (1.0 / (((double)qtbl->quantval[i] *
aanscalefactor[row] * aanscalefactor[col] * 8.0)));
i++;
}
@@ -370,7 +370,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
*/
METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
{
register DCTELEM *workspaceptr;
register JSAMPROW elemptr;
@@ -381,19 +381,19 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
elemptr = sample_data[elemr] + start_col;
#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
#else
{
register int elemc;
for (elemc = DCTSIZE; elemc > 0; elemc--)
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
}
#endif
}
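
convsamp() does nothing more than level-shift unsigned samples into the signed range the forward DCT expects; the unrolled body is purely an optimization. The whole per-sample transformation, in a trivial standalone form:

#include <stdio.h>

#define CENTERJSAMPLE 128

int main(void)
{
  unsigned char samples[4] = { 0, 127, 128, 255 };
  for (int i = 0; i < 4; i++)                 /* 0..255 maps to -128..127 */
    printf("%d ", samples[i] - CENTERJSAMPLE);
  printf("\n");                               /* -128 -1 0 127 */
  return 0;
}
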
@@ -405,7 +405,7 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
*/
METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
int i;
DCTELEM temp;
@@ -426,15 +426,15 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
if (temp < 0) {
temp = -temp;
product = (UDCTELEM2)(temp + corr) * recip;
- product >>= shift + sizeof(DCTELEM)*8;
+ product >>= shift + sizeof(DCTELEM) * 8;
temp = (DCTELEM)product;
temp = -temp;
} else {
product = (UDCTELEM2)(temp + corr) * recip;
- product >>= shift + sizeof(DCTELEM)*8;
+ product >>= shift + sizeof(DCTELEM) * 8;
temp = (DCTELEM)product;
}
- output_ptr[i] = (JCOEF) temp;
+ output_ptr[i] = (JCOEF)temp;
}
#else
@@ -457,20 +457,20 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
* If your machine's division is fast enough, define FAST_DIVIDE.
*/
#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b) a /= b
+#define DIVIDE_BY(a, b) a /= b
#else
-#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
+#define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0
#endif
if (temp < 0) {
temp = -temp;
- temp += qval>>1; /* for rounding */
+ temp += qval >> 1; /* for rounding */
DIVIDE_BY(temp, qval);
temp = -temp;
} else {
- temp += qval>>1; /* for rounding */
+ temp += qval >> 1; /* for rounding */
DIVIDE_BY(temp, qval);
}
- output_ptr[i] = (JCOEF) temp;
+ output_ptr[i] = (JCOEF)temp;
}
#endif
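
The non-reciprocal quantization path above is a guarded rounded division: add half the quantizer, then divide, with DIVIDE_BY() skipping the division entirely when the biased coefficient is smaller than the divisor (which in that case yields the correct quotient of 0 anyway, sparing a slow divide on some machines). A hedged sketch with a hypothetical helper:

#include <stdio.h>

#define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0

static int quantize_one(int coef, int qval)   /* hypothetical helper */
{
  int temp = coef;
  if (temp < 0) {
    temp = -temp;
    temp += qval >> 1;          /* round |coef| / qval to nearest */
    DIVIDE_BY(temp, qval);
    temp = -temp;
  } else {
    temp += qval >> 1;
    DIVIDE_BY(temp, qval);
  }
  return temp;
}

int main(void)
{
  printf("%d %d %d\n", quantize_one(-25, 16), quantize_one(7, 16),
         quantize_one(200, 16));   /* -2 0 13 */
  return 0;
}
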
@@ -487,14 +487,13 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
*/
METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
+forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
/* This version is used for integer DCT implementations. */
{
/* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
DCTELEM *workspace;
JDIMENSION bi;
@@ -522,9 +521,9 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
#ifdef DCT_FLOAT_SUPPORTED
-
METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
+convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
{
register FAST_FLOAT *workspaceptr;
register JSAMPROW elemptr;
@@ -534,20 +533,19 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
for (elemr = 0; elemr < DCTSIZE; elemr++) {
elemptr = sample_data[elemr] + start_col;
#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
#else
{
register int elemc;
for (elemc = DCTSIZE; elemc > 0; elemc--)
- *workspaceptr++ = (FAST_FLOAT)
- (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
}
#endif
}
@@ -555,7 +553,8 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
+quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
{
register FAST_FLOAT temp;
register int i;
@@ -571,20 +570,20 @@ quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace
* The maximum coefficient size is +-16K (for 12-bit data), so this
* code should work for either 16-bit or 32-bit ints.
*/
- output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+ output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
}
}
METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
+forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks)
/* This version is used for floating-point DCT implementations. */
{
/* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
FAST_FLOAT *workspace;
JDIMENSION bi;
@@ -618,15 +617,15 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_forward_dct (j_compress_ptr cinfo)
+jinit_forward_dct(j_compress_ptr cinfo)
{
my_fdct_ptr fdct;
int i;
fdct = (my_fdct_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_fdct_controller));
- cinfo->fdct = (struct jpeg_forward_dct *) fdct;
+ cinfo->fdct = (struct jpeg_forward_dct *)fdct;
fdct->pub.start_pass = start_pass_fdctmgr;
/* First determine the DCT... */
@@ -703,12 +702,12 @@ jinit_forward_dct (j_compress_ptr cinfo)
#ifdef DCT_FLOAT_SUPPORTED
if (cinfo->dct_method == JDCT_FLOAT)
fdct->float_workspace = (FAST_FLOAT *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(FAST_FLOAT) * DCTSIZE2);
else
#endif
fdct->workspace = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(DCTELEM) * DCTSIZE2);
/* Mark divisor tables unallocated */
diff --git a/media/libjpeg/jchuff.c b/media/libjpeg/jchuff.c
index fffaacebce..f4dfa1cb54 100644
--- a/media/libjpeg/jchuff.c
+++ b/media/libjpeg/jchuff.c
@@ -4,8 +4,10 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -16,6 +18,9 @@
* back up to the start of the current MCU. To do this, we copy state
* variables into local working storage, and update them back to the
* permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -31,32 +36,33 @@
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
- * This feature is enabled by default only on ARM processors, because some x86
+ * This feature is enabled by default only on Arm processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
- * have a fast implementation of it. When building for ARMv6, you can
+ * have a fast implementation of it. When building for Armv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif
#ifdef USE_CLZ_INTRINSIC
-#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
-#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
#else
-#include "jpeg_nbits_table.h"
-#define JPEG_NBITS(x) (jpeg_nbits_table[x])
-#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
#endif
-
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
#endif
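
JPEG_NBITS(x) computes the bit length of x (one plus the index of its highest set bit), which Huffman encoding needs for every coefficient; the reorganized block above adds an MSVC spelling of the count-leading-zeros intrinsic and keeps the 64 KB lookup table as the fallback for chips with slow bsr. A GCC/Clang-only sketch of the intrinsic path:

#include <stdio.h>

/* bit length of x: 0 -> 0, 1 -> 1, 255 -> 8, 256 -> 9 (32-bit unsigned) */
static int jpeg_nbits(unsigned x)
{
  return x ? 32 - __builtin_clz(x) : 0;   /* clz(0) is undefined, so guard */
}

int main(void)
{
  printf("%d %d %d %d\n", jpeg_nbits(0), jpeg_nbits(1), jpeg_nbits(255),
         jpeg_nbits(256));                /* 0 1 8 9 */
  return 0;
}
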
@@ -66,31 +72,42 @@
* but must not be updated permanently until we complete the MCU.
*/
-typedef struct {
- size_t put_buffer; /* current bit-accumulation buffer */
- int put_bits; /* # of bits now in it */
- int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-} savable_state;
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
*/
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;
#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).put_buffer = (src).put_buffer, \
- (dest).put_bits = (src).put_bits, \
- (dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
+typedef bit_buf_type simd_bit_buf_type;
#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+ (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE 64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE 32
+#else
+#error Cannot determine word size
#endif
+#define SIMD_BIT_BUF_SIZE (sizeof(simd_bit_buf_type) * 8)
+typedef struct {
+ union {
+ bit_buf_type c;
+ simd_bit_buf_type simd;
+ } put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ /* (Neon GAS: # of bits now in it) */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
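
Note the inversion in the new savable_state: the buffer now tracks free_bits (bits still available) rather than put_bits (bits consumed), so the must-flush test after inserting a code becomes a simple sign check instead of a comparison against the buffer width. A minimal sketch of that bookkeeping (32-bit buffer for brevity; names are illustrative):

#include <stdio.h>

int main(void)
{
  unsigned put_buffer = 0;
  int free_bits = 32;                   /* whole buffer available */
  unsigned code = 0x5; int size = 3;    /* 3-bit code to insert */

  free_bits -= size;
  if (free_bits < 0)
    printf("flush first\n");            /* PUT_AND_FLUSH() in the patch */
  else
    put_buffer = (put_buffer << size) | code;
  printf("free_bits=%d buffer=0x%X\n", free_bits, put_buffer); /* 29, 0x5 */
  return 0;
}
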
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
@@ -124,16 +141,17 @@ typedef struct {
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
savable_state cur; /* Current bit buffer & DC state */
j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
} working_state;
/* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
#ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
#endif
@@ -144,9 +162,9 @@ METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
*/
METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, dctbl, actbl;
jpeg_component_info *compptr;
@@ -180,30 +198,39 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
if (entropy->dc_count_ptrs[dctbl] == NULL)
entropy->dc_count_ptrs[dctbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
+ memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
if (entropy->ac_count_ptrs[actbl] == NULL)
entropy->ac_count_ptrs[actbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
+ memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
#endif
} else {
/* Compute derived values for Huffman tables */
/* We may do this more than once for a table, but it's not expensive */
jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
- & entropy->dc_derived_tbls[dctbl]);
+ &entropy->dc_derived_tbls[dctbl]);
jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
- & entropy->ac_derived_tbls[actbl]);
+ &entropy->ac_derived_tbls[actbl]);
}
/* Initialize DC predictions to 0 */
entropy->saved.last_dc_val[ci] = 0;
}
/* Initialize bit buffer to empty */
- entropy->saved.put_buffer = 0;
- entropy->saved.put_bits = 0;
+ if (entropy->simd) {
+ entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ entropy->saved.free_bits = 0;
+#else
+ entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+ } else {
+ entropy->saved.put_buffer.c = 0;
+ entropy->saved.free_bits = BIT_BUF_SIZE;
+ }
/* Initialize restart stuff */
entropy->restarts_to_go = cinfo->restart_interval;
@@ -219,8 +246,8 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
*/
GLOBAL(void)
-jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
- c_derived_tbl **pdtbl)
+jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
+ c_derived_tbl **pdtbl)
{
JHUFF_TBL *htbl;
c_derived_tbl *dtbl;
@@ -244,7 +271,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
/* Allocate a workspace if we haven't already done so. */
if (*pdtbl == NULL)
*pdtbl = (c_derived_tbl *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(c_derived_tbl));
dtbl = *pdtbl;
@@ -252,11 +279,11 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
p = 0;
for (l = 1; l <= 16; l++) {
- i = (int) htbl->bits[l];
+ i = (int)htbl->bits[l];
if (i < 0 || p + i > 256) /* protect against table overrun */
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
while (i--)
- huffsize[p++] = (char) l;
+ huffsize[p++] = (char)l;
}
huffsize[p] = 0;
lastp = p;
@@ -268,14 +295,14 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
si = huffsize[0];
p = 0;
while (huffsize[p]) {
- while (((int) huffsize[p]) == si) {
+ while (((int)huffsize[p]) == si) {
huffcode[p++] = code;
code++;
}
/* code is now 1 more than the last code used for codelength si; but
* it must still fit in si bits, since no code is allowed to be all ones.
*/
- if (((JLONG) code) >= (((JLONG) 1) << si))
+ if (((JLONG)code) >= (((JLONG)1) << si))
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
code <<= 1;
si++;
@@ -288,7 +315,8 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
* this lets us detect duplicate VAL entries here, and later
* allows emit_bits to detect any attempt to emit such symbols.
*/
- MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
+ memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
+ memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
/* This is also a convenient place to check for out-of-range
* and duplicated VAL entries. We allow 0..255 for AC symbols
@@ -310,20 +338,21 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
/* Outputting bytes to the file */
/* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action) \
- { *(state)->next_output_byte++ = (JOCTET) (val); \
- if (--(state)->free_in_buffer == 0) \
- if (! dump_buffer(state)) \
- { action; } }
+#define emit_byte(state, val, action) { \
+ *(state)->next_output_byte++ = (JOCTET)(val); \
+ if (--(state)->free_in_buffer == 0) \
+ if (!dump_buffer(state)) \
+ { action; } \
+}
LOCAL(boolean)
-dump_buffer (working_state *state)
+dump_buffer(working_state *state)
/* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
{
struct jpeg_destination_mgr *dest = state->cinfo->dest;
- if (! (*dest->empty_output_buffer) (state->cinfo))
+ if (!(*dest->empty_output_buffer) (state->cinfo))
return FALSE;
/* After a successful buffer dump, must reset buffer pointers */
state->next_output_byte = dest->next_output_byte;
@@ -334,89 +363,93 @@ dump_buffer (working_state *state)
/* Outputting bits to the file */
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code. In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it. This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF. Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
*/
-
-#define EMIT_BYTE() { \
- JOCTET c; \
- put_bits -= 8; \
- c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
- *buffer++ = c; \
- if (c == 0xFF) /* need to stuff a zero byte? */ \
- *buffer++ = 0; \
- }
-
-#define PUT_BITS(code, size) { \
- put_bits += size; \
- put_buffer = (put_buffer << size) | code; \
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
}
-#define CHECKBUF15() { \
- if (put_bits > 15) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 56); \
+ buffer[1] = (JOCTET)(put_buffer >> 48); \
+ buffer[2] = (JOCTET)(put_buffer >> 40); \
+ buffer[3] = (JOCTET)(put_buffer >> 32); \
+ buffer[4] = (JOCTET)(put_buffer >> 24); \
+ buffer[5] = (JOCTET)(put_buffer >> 16); \
+ buffer[6] = (JOCTET)(put_buffer >> 8); \
+ buffer[7] = (JOCTET)(put_buffer); \
+ buffer += 8; \
} \
}
-#define CHECKBUF31() { \
- if (put_bits > 31) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- } \
-}
+#else
-#define CHECKBUF47() { \
- if (put_bits > 47) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer); \
+ buffer += 4; \
} \
}
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
#endif
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
- CHECKBUF47() \
- PUT_BITS(code, size) \
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
}
-#define EMIT_CODE(code, size) { \
- temp2 &= (((JLONG) 1)<<nbits) - 1; \
- CHECKBUF31() \
- PUT_BITS(code, size) \
- PUT_BITS(temp2, nbits) \
- }
-
-#else
-
-#define EMIT_BITS(code, size) { \
- PUT_BITS(code, size) \
- CHECKBUF15() \
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
}
-#define EMIT_CODE(code, size) { \
- temp2 &= (((JLONG) 1)<<nbits) - 1; \
- PUT_BITS(code, size) \
- CHECKBUF15() \
- PUT_BITS(temp2, nbits) \
- CHECKBUF15() \
- }
-
-#endif
+#define PUT_CODE(code, size) { \
+ temp &= (((JLONG)1) << nbits) - 1; \
+ temp |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(temp, nbits) \
+}
/* Although it is exceedingly rare, it is possible for a Huffman-encoded
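Two tricks in the rewritten output path deserve unpacking. EMIT_BYTE() stores the byte plus a speculative 0x00 and then advances the pointer with buffer -= -2 + (b < 0xFF), i.e. by 2 after an 0xFF (keeping the stuffed zero) and by 1 otherwise, with no branch. FLUSH() first asks whether any byte in the word equals 0xFF using lane-parallel arithmetic: per byte, b & 0x80 & ~(b + 1) is nonzero only when b == 0xFF, and carries between lanes can only cause false positives, which merely fall back to the byte-by-byte path. A standalone check of the detector (64-bit word assumed):

#include <stdio.h>

static int has_ff_byte(unsigned long long w)
{
  return (w & 0x8080808080808080ULL & ~(w + 0x0101010101010101ULL)) != 0;
}

int main(void)
{
  printf("%d\n", has_ff_byte(0x0011223344556677ULL));  /* 0: no 0xFF byte */
  printf("%d\n", has_ff_byte(0x0011FF3344556677ULL));  /* 1: 0xFF present */
  return 0;
}
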
@@ -428,55 +461,81 @@ dump_buffer (working_state *state)
* scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
* larger than 200 bytes.
*/
-#define BUFSIZE (DCTSIZE2 * 4)
+#define BUFSIZE (DCTSIZE2 * 8)
#define LOAD_BUFFER() { \
if (state->free_in_buffer < BUFSIZE) { \
localbuf = 1; \
buffer = _buffer; \
- } \
- else buffer = state->next_output_byte; \
- }
+ } else \
+ buffer = state->next_output_byte; \
+}
#define STORE_BUFFER() { \
if (localbuf) { \
+ size_t bytes, bytestocopy; \
bytes = buffer - _buffer; \
buffer = _buffer; \
while (bytes > 0) { \
- bytestocopy = min(bytes, state->free_in_buffer); \
- MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
+ bytestocopy = MIN(bytes, state->free_in_buffer); \
+ memcpy(state->next_output_byte, buffer, bytestocopy); \
state->next_output_byte += bytestocopy; \
buffer += bytestocopy; \
state->free_in_buffer -= bytestocopy; \
if (state->free_in_buffer == 0) \
- if (! dump_buffer(state)) return FALSE; \
+ if (!dump_buffer(state)) return FALSE; \
bytes -= bytestocopy; \
} \
- } \
- else { \
+ } else { \
state->free_in_buffer -= (buffer - state->next_output_byte); \
state->next_output_byte = buffer; \
} \
- }
+}
LOCAL(boolean)
-flush_bits (working_state *state)
+flush_bits(working_state *state)
{
- JOCTET _buffer[BUFSIZE], *buffer;
- size_t put_buffer; int put_bits;
- size_t bytes, bytestocopy; int localbuf = 0;
+ JOCTET _buffer[BUFSIZE], *buffer, temp;
+ simd_bit_buf_type put_buffer; int put_bits;
+ int localbuf = 0;
+
+ if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ put_bits = state->cur.free_bits;
+#else
+ put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+ put_buffer = state->cur.put_buffer.simd;
+ } else {
+ put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
+ }
- put_buffer = state->cur.put_buffer;
- put_bits = state->cur.put_bits;
LOAD_BUFFER()
- /* fill any partial byte with ones */
- PUT_BITS(0x7F, 7)
- while (put_bits >= 8) EMIT_BYTE()
+ while (put_bits >= 8) {
+ put_bits -= 8;
+ temp = (JOCTET)(put_buffer >> put_bits);
+ EMIT_BYTE(temp)
+ }
+ if (put_bits) {
+ /* fill partial byte with ones */
+ temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+ EMIT_BYTE(temp)
+ }
- state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
- state->cur.put_bits = 0;
+ if (state->simd) { /* and reset bit buffer to empty */
+ state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ state->cur.free_bits = 0;
+#else
+ state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+ } else {
+ state->cur.put_buffer.c = 0;
+ state->cur.free_bits = BIT_BUF_SIZE;
+ }
STORE_BUFFER()
return TRUE;
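
flush_bits() now pads the final partial byte with 1 bits in a single expression instead of pushing a 7-bit 0x7F through the bit buffer; padding with ones keeps the tail from mimicking a valid code, since no Huffman code is allowed to be all ones (see the comment near the end of this file). A worked check:

#include <stdio.h>

int main(void)
{
  unsigned put_buffer = 0x5;  /* 3 pending bits: 101 */
  int put_bits = 3;
  unsigned char b = (unsigned char)((put_buffer << (8 - put_bits)) |
                                    (0xFF >> put_bits));
  printf("0x%02X\n", b);      /* 0xBF: 101 followed by five 1 padding bits */
  return 0;
}
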
@@ -486,11 +545,11 @@ flush_bits (working_state *state)
/* Encode a single block's worth of coefficients */
LOCAL(boolean)
-encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
{
JOCTET _buffer[BUFSIZE], *buffer;
- size_t bytes, bytestocopy; int localbuf = 0;
+ int localbuf = 0;
LOAD_BUFFER()
@@ -503,108 +562,91 @@ encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
}
LOCAL(boolean)
-encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
{
- int temp, temp2, temp3;
- int nbits;
- int r, code, size;
+ int temp, nbits, free_bits;
+ bit_buf_type put_buffer;
JOCTET _buffer[BUFSIZE], *buffer;
- size_t put_buffer; int put_bits;
- int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
- size_t bytes, bytestocopy; int localbuf = 0;
+ int localbuf = 0;
- put_buffer = state->cur.put_buffer;
- put_bits = state->cur.put_bits;
+ free_bits = state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
LOAD_BUFFER()
/* Encode the DC coefficient difference per section F.1.2.1 */
- temp = temp2 = block[0] - last_dc_val;
-
- /* This is a well-known technique for obtaining the absolute value without a
- * branch. It is derived from an assembly language technique presented in
- * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
- * Agner Fog.
- */
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- temp ^= temp3;
- temp -= temp3;
+ temp = block[0] - last_dc_val;
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2 += temp3;
+ /* This is a well-known technique for obtaining the absolute value without a
+ * branch. It is derived from an assembly language technique presented in
+ * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
+ * Agner Fog. This code assumes we are on a two's complement machine.
+ */
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp += nbits;
+ nbits ^= temp;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = JPEG_NBITS(temp);
-
- /* Emit the Huffman-coded symbol for the number of bits */
- code = dctbl->ehufco[nbits];
- size = dctbl->ehufsi[nbits];
- EMIT_BITS(code, size)
+ nbits = JPEG_NBITS(nbits);
- /* Mask off any extra bits in code */
- temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- EMIT_BITS(temp2, nbits)
+ /* Emit the Huffman-coded symbol for the number of bits.
+ * Emit that number of bits of the value, if positive,
+ * or the complement of its magnitude, if negative.
+ */
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
/* Encode the AC coefficients per section F.1.2.2 */
- r = 0; /* r = run length of zeros */
+ {
+ int r = 0; /* r = run length of zeros */
/* Manually unroll the k loop to eliminate the counter variable. This
* improves performance greatly on systems with a limited number of
* registers (such as x86.)
*/
-#define kloop(jpeg_natural_order_of_k) { \
+#define kloop(jpeg_natural_order_of_k) { \
if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
- r++; \
+ r += 16; \
} else { \
- temp2 = temp; \
/* Branch-less absolute value, bitwise complement, etc., same as above */ \
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
- temp ^= temp3; \
- temp -= temp3; \
- temp2 += temp3; \
- nbits = JPEG_NBITS_NONZERO(temp); \
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp += nbits; \
+ nbits ^= temp; \
+ nbits = JPEG_NBITS_NONZERO(nbits); \
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
- while (r > 15) { \
- EMIT_BITS(code_0xf0, size_0xf0) \
- r -= 16; \
+ while (r >= 16 * 16) { \
+ r -= 16 * 16; \
+ PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
} \
/* Emit Huffman symbol for run length / number of bits */ \
- temp3 = (r << 4) + nbits; \
- code = actbl->ehufco[temp3]; \
- size = actbl->ehufsi[temp3]; \
- EMIT_CODE(code, size) \
- r = 0; \
+ r += nbits; \
+ PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
+ r = 0; \
} \
}
- /* One iteration for each value in jpeg_natural_order[] */
- kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3);
- kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18);
- kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26);
- kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27);
- kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21);
- kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57);
- kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15);
- kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58);
- kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39);
- kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47);
- kloop(55); kloop(62); kloop(63);
-
- /* If the last coef(s) were zero, emit an end-of-block code */
- if (r > 0) {
- code = actbl->ehufco[0];
- size = actbl->ehufsi[0];
- EMIT_BITS(code, size)
+ /* One iteration for each value in jpeg_natural_order[] */
+ kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3);
+ kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18);
+ kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26);
+ kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27);
+ kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21);
+ kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57);
+ kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15);
+ kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58);
+ kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39);
+ kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47);
+ kloop(55); kloop(62); kloop(63);
+
+ /* If the last coef(s) were zero, emit an end-of-block code */
+ if (r > 0) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
}
- state->cur.put_buffer = put_buffer;
- state->cur.put_bits = put_bits;
+ state->cur.put_buffer.c = put_buffer;
+ state->cur.free_bits = free_bits;
STORE_BUFFER()
return TRUE;
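
Two things changed inside encode_one_block(). First, the branch-free absolute value now doubles as the complement computation: with m = v >> 31, after temp += m; m ^= temp, the variable m holds |v| (whose bit length indexes the Huffman table) while temp, masked to nbits bits, is exactly the extra-bits pattern the spec wants for both signs. Second, the zero-run counter advances by 16 per zero, so r is already the high nibble of the run/size symbol and the emit-0xF0 threshold becomes r >= 256 instead of r > 15. A standalone check of the first trick (two's complement assumed, as the original comment notes):

#include <limits.h>
#include <stdio.h>

int main(void)
{
  int v = -5;                                    /* DC difference, say */
  int mask = v >> (CHAR_BIT * sizeof(int) - 1);  /* -1 if negative, else 0 */
  int temp = v + mask;                           /* v - 1 for negatives */
  int mag  = mask ^ temp;                        /* |v|, branch-free */
  int nbits = 0;
  while ((1 << nbits) <= mag)                    /* stand-in for JPEG_NBITS */
    nbits++;
  printf("mag=%d nbits=%d bits=%d\n", mag, nbits,
         temp & ((1 << nbits) - 1));             /* mag=5 nbits=3 bits=2 */
  return 0;
}

The emitted pattern 010 is the bitwise complement of |-5| = 101 in three bits, matching section F.1.2.1.
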
@@ -616,11 +658,11 @@ encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
*/
LOCAL(boolean)
-emit_restart (working_state *state, int restart_num)
+emit_restart(working_state *state, int restart_num)
{
int ci;
- if (! flush_bits(state))
+ if (!flush_bits(state))
return FALSE;
emit_byte(state, 0xFF, return FALSE);
@@ -641,9 +683,9 @@ emit_restart (working_state *state, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
working_state state;
int blkn, ci;
jpeg_component_info *compptr;
@@ -651,13 +693,14 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Load up working state */
state.next_output_byte = cinfo->dest->next_output_byte;
state.free_in_buffer = cinfo->dest->free_in_buffer;
- ASSIGN_STATE(state.cur, entropy->saved);
+ state.cur = entropy->saved;
state.cinfo = cinfo;
+ state.simd = entropy->simd;
/* Emit restart marker if needed */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! emit_restart(&state, entropy->next_restart_num))
+ if (!emit_restart(&state, entropy->next_restart_num))
return FALSE;
}
@@ -666,10 +709,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
ci = cinfo->MCU_membership[blkn];
compptr = cinfo->cur_comp_info[ci];
- if (! encode_one_block_simd(&state,
- MCU_data[blkn][0], state.cur.last_dc_val[ci],
- entropy->dc_derived_tbls[compptr->dc_tbl_no],
- entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+ if (!encode_one_block_simd(&state,
+ MCU_data[blkn][0], state.cur.last_dc_val[ci],
+ entropy->dc_derived_tbls[compptr->dc_tbl_no],
+ entropy->ac_derived_tbls[compptr->ac_tbl_no]))
return FALSE;
/* Update last_dc_val */
state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -678,10 +721,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
ci = cinfo->MCU_membership[blkn];
compptr = cinfo->cur_comp_info[ci];
- if (! encode_one_block(&state,
- MCU_data[blkn][0], state.cur.last_dc_val[ci],
- entropy->dc_derived_tbls[compptr->dc_tbl_no],
- entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+ if (!encode_one_block(&state,
+ MCU_data[blkn][0], state.cur.last_dc_val[ci],
+ entropy->dc_derived_tbls[compptr->dc_tbl_no],
+ entropy->ac_derived_tbls[compptr->ac_tbl_no]))
return FALSE;
/* Update last_dc_val */
state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -691,7 +734,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Completed MCU, so update state */
cinfo->dest->next_output_byte = state.next_output_byte;
cinfo->dest->free_in_buffer = state.free_in_buffer;
- ASSIGN_STATE(entropy->saved, state.cur);
+ entropy->saved = state.cur;
/* Update restart-interval state too */
if (cinfo->restart_interval) {
@@ -712,25 +755,26 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-finish_pass_huff (j_compress_ptr cinfo)
+finish_pass_huff(j_compress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
working_state state;
/* Load up working state ... flush_bits needs it */
state.next_output_byte = cinfo->dest->next_output_byte;
state.free_in_buffer = cinfo->dest->free_in_buffer;
- ASSIGN_STATE(state.cur, entropy->saved);
+ state.cur = entropy->saved;
state.cinfo = cinfo;
+ state.simd = entropy->simd;
/* Flush out the last data */
- if (! flush_bits(&state))
+ if (!flush_bits(&state))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
/* Update state */
cinfo->dest->next_output_byte = state.next_output_byte;
cinfo->dest->free_in_buffer = state.free_in_buffer;
- ASSIGN_STATE(entropy->saved, state.cur);
+ entropy->saved = state.cur;
}
@@ -751,8 +795,8 @@ finish_pass_huff (j_compress_ptr cinfo)
/* Process a single block's worth of coefficients */
LOCAL(void)
-htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
- long dc_counts[], long ac_counts[])
+htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
+ long dc_counts[], long ac_counts[])
{
register int temp;
register int nbits;
@@ -773,7 +817,7 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
/* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much.
*/
- if (nbits > MAX_COEF_BITS+1)
+ if (nbits > MAX_COEF_BITS + 1)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count the Huffman symbol for the number of bits */
@@ -824,9 +868,9 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
*/
METHODDEF(boolean)
-encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int blkn, ci;
jpeg_component_info *compptr;
@@ -863,13 +907,14 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* one bits (so that padding bits added at the end of a compressed segment
* can't look like a valid code). Because of the canonical ordering of
* codewords, this just means that there must be an unused slot in the
- * longest codeword length category. Section K.2 of the JPEG spec suggests
- * reserving such a slot by pretending that symbol 256 is a valid symbol
- * with count 1. In theory that's not optimal; giving it count zero but
- * including it in the symbol set anyway should give a better Huffman code.
- * But the theoretically better code actually seems to come out worse in
- * practice, because it produces more all-ones bytes (which incur stuffed
- * zero bytes in the final file). In any case the difference is tiny.
+ * longest codeword length category. Annex K (Clause K.2) of
+ * Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 suggests reserving such a slot
+ * by pretending that symbol 256 is a valid symbol with count 1. In theory
+ * that's not optimal; giving it count zero but including it in the symbol set
+ * anyway should give a better Huffman code. But the theoretically better code
+ * actually seems to come out worse in practice, because it produces more
+ * all-ones bytes (which incur stuffed zero bytes in the final file). In any
+ * case the difference is tiny.
*
* The JPEG standard requires Huffman codes to be no more than 16 bits long.
* If some symbols have a very small but nonzero probability, the Huffman tree
@@ -884,10 +929,10 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
+jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
{
-#define MAX_CLEN 32 /* assumed maximum initial code length */
- UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */
+#define MAX_CLEN 32 /* assumed maximum initial code length */
+ UINT8 bits[MAX_CLEN + 1]; /* bits[k] = # of symbols with code length k */
int codesize[257]; /* codesize[k] = code length of symbol k */
int others[257]; /* next symbol in current branch of tree */
int c1, c2;
@@ -896,8 +941,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* This algorithm is explained in section K.2 of the JPEG standard */
- MEMZERO(bits, sizeof(bits));
- MEMZERO(codesize, sizeof(codesize));
+ memset(bits, 0, sizeof(bits));
+ memset(codesize, 0, sizeof(codesize));
for (i = 0; i < 257; i++)
others[i] = -1; /* init links to empty */
@@ -971,13 +1016,13 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
* Huffman procedure assigned any such lengths, we must adjust the coding.
- * Here is what the JPEG spec says about how this next bit works:
- * Since symbols are paired for the longest Huffman code, the symbols are
- * removed from this length category two at a time. The prefix for the pair
- * (which is one bit shorter) is allocated to one of the pair; then,
- * skipping the BITS entry for that prefix length, a code word from the next
- * shortest nonzero BITS entry is converted into a prefix for two code words
- * one bit longer.
+ * Here is what Rec. ITU-T T.81 | ISO/IEC 10918-1 says about how this next
+ * bit works: Since symbols are paired for the longest Huffman code, the
+ * symbols are removed from this length category two at a time. The prefix
+ * for the pair (which is one bit shorter) is allocated to one of the pair;
+ * then, skipping the BITS entry for that prefix length, a code word from the
+ * next shortest nonzero BITS entry is converted into a prefix for two code
+ * words one bit longer.
*/
for (i = MAX_CLEN; i > 16; i--) {
@@ -987,8 +1032,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
j--;
bits[i] -= 2; /* remove two symbols */
- bits[i-1]++; /* one goes in this length */
- bits[j+1] += 2; /* two new symbols in this length */
+ bits[i - 1]++; /* one goes in this length */
+ bits[j + 1] += 2; /* two new symbols in this length */
bits[j]--; /* symbol of this length is now a prefix */
}
}
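
A hypothetical worked example of the adjustment loop above: suppose
bits[17] == 2 and the next shorter nonzero entry is bits[15] (so j == 15).
One pass of the loop then performs:

    bits[17] -= 2;  /* remove the pair of 17-bit symbols */
    bits[16]++;     /* one of the pair takes over the 16-bit prefix */
    bits[16] += 2;  /* a 15-bit code word splits into two 16-bit code words */
    bits[15]--;     /* ...so that 15-bit code word is now a prefix */

The total symbol count is unchanged (-2 + 1 + 2 - 1 == 0); only the code
lengths move, which is exactly the Annex K procedure quoted above.
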
@@ -999,17 +1044,18 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
bits[i]--;
/* Return final symbol counts (only for lengths 0..16) */
- MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
+ memcpy(htbl->bits, bits, sizeof(htbl->bits));
/* Return a list of the symbols sorted by code length */
/* It's not real clear to me why we don't need to consider the codelength
- * changes made above, but the JPEG spec seems to think this works.
+ * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think
+ * this works.
*/
p = 0;
for (i = 1; i <= MAX_CLEN; i++) {
for (j = 0; j <= 255; j++) {
if (codesize[j] == i) {
- htbl->huffval[p] = (UINT8) j;
+ htbl->huffval[p] = (UINT8)j;
p++;
}
}
@@ -1025,9 +1071,9 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
*/
METHODDEF(void)
-finish_pass_gather (j_compress_ptr cinfo)
+finish_pass_gather(j_compress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, dctbl, actbl;
jpeg_component_info *compptr;
JHUFF_TBL **htblptr;
@@ -1037,24 +1083,24 @@ finish_pass_gather (j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
- MEMZERO(did_dc, sizeof(did_dc));
- MEMZERO(did_ac, sizeof(did_ac));
+ memset(did_dc, 0, sizeof(did_dc));
+ memset(did_ac, 0, sizeof(did_ac));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
dctbl = compptr->dc_tbl_no;
actbl = compptr->ac_tbl_no;
- if (! did_dc[dctbl]) {
- htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
+ if (!did_dc[dctbl]) {
+ htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
did_dc[dctbl] = TRUE;
}
- if (! did_ac[actbl]) {
- htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
+ if (!did_ac[actbl]) {
+ htblptr = &cinfo->ac_huff_tbl_ptrs[actbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
did_ac[actbl] = TRUE;
}
@@ -1070,15 +1116,15 @@ finish_pass_gather (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_huff_encoder (j_compress_ptr cinfo)
+jinit_huff_encoder(j_compress_ptr cinfo)
{
huff_entropy_ptr entropy;
int i;
entropy = (huff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(huff_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass_huff;
/* Mark tables unallocated */
diff --git a/media/libjpeg/jchuff.h b/media/libjpeg/jchuff.h
index 4236089adc..314a2325c9 100644
--- a/media/libjpeg/jchuff.h
+++ b/media/libjpeg/jchuff.h
@@ -20,9 +20,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
+#define MAX_COEF_BITS 10
#else
-#define MAX_COEF_BITS 14
+#define MAX_COEF_BITS 14
#endif
/* Derived data constructed for each Huffman table */
@@ -34,10 +34,9 @@ typedef struct {
} c_derived_tbl;
/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
- (j_compress_ptr cinfo, boolean isDC, int tblno,
- c_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC,
+ int tblno, c_derived_tbl **pdtbl);
/* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
- (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
+EXTERN(void) jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl,
+ long freq[]);
diff --git a/media/libjpeg/jcicc.c b/media/libjpeg/jcicc.c
new file mode 100644
index 0000000000..11037ff694
--- /dev/null
+++ b/media/libjpeg/jcicc.c
@@ -0,0 +1,105 @@
+/*
+ * jcicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to write International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to embed the profile data in a JPEG file while writing it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/*
+ * Since an ICC profile can be larger than the maximum size of a JPEG marker
+ * (64K), we need provisions to split it into multiple markers. The format
+ * defined by the ICC specifies one or more APP2 markers containing the
+ * following data:
+ *      Identifying string      ASCII "ICC_PROFILE\0" (12 bytes)
+ *      Marker sequence number  1 for first APP2, 2 for next, etc (1 byte)
+ *      Number of markers       Total number of APP2's used (1 byte)
+ *      Profile data            (remainder of APP2 data)
+ * Decoders should use the marker sequence numbers to reassemble the profile,
+ * rather than assuming that the APP2 markers appear in the correct sequence.
+ */
+
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
+#define MAX_BYTES_IN_MARKER 65533 /* maximum data len of a JPEG marker */
+#define MAX_DATA_BYTES_IN_MARKER (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN)
+
+
+/*
+ * This routine writes the given ICC profile data into a JPEG file. It *must*
+ * be called AFTER calling jpeg_start_compress() and BEFORE the first call to
+ * jpeg_write_scanlines(). (This ordering ensures that the APP2 marker(s) will
+ * appear after the SOI and JFIF or Adobe markers, but before all else.)
+ */
+
+GLOBAL(void)
+jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len)
+{
+ unsigned int num_markers; /* total number of markers we'll write */
+ int cur_marker = 1; /* per spec, counting starts at 1 */
+ unsigned int length; /* number of bytes to write in this marker */
+
+ if (icc_data_ptr == NULL || icc_data_len == 0)
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ if (cinfo->global_state < CSTATE_SCANNING)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Calculate the number of markers we'll need, rounding up of course */
+ num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER;
+ if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len)
+ num_markers++;
+
+ while (icc_data_len > 0) {
+ /* length of profile to put in this marker */
+ length = icc_data_len;
+ if (length > MAX_DATA_BYTES_IN_MARKER)
+ length = MAX_DATA_BYTES_IN_MARKER;
+ icc_data_len -= length;
+
+ /* Write the JPEG marker header (APP2 code and marker length) */
+ jpeg_write_m_header(cinfo, ICC_MARKER,
+ (unsigned int)(length + ICC_OVERHEAD_LEN));
+
+ /* Write the marker identifying string "ICC_PROFILE" (null-terminated). We
+ * code it in this less-than-transparent way so that the code works even if
+ * the local character set is not ASCII.
+ */
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x5F);
+ jpeg_write_m_byte(cinfo, 0x50);
+ jpeg_write_m_byte(cinfo, 0x52);
+ jpeg_write_m_byte(cinfo, 0x4F);
+ jpeg_write_m_byte(cinfo, 0x46);
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x4C);
+ jpeg_write_m_byte(cinfo, 0x45);
+ jpeg_write_m_byte(cinfo, 0x0);
+
+ /* Add the sequencing info */
+ jpeg_write_m_byte(cinfo, cur_marker);
+ jpeg_write_m_byte(cinfo, (int)num_markers);
+
+ /* Add the profile data */
+ while (length--) {
+ jpeg_write_m_byte(cinfo, *icc_data_ptr);
+ icc_data_ptr++;
+ }
+ cur_marker++;
+ }
+}
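
A minimal caller-side sketch of the new entry point (icc_buf and icc_len are
hypothetical names for a profile obtained elsewhere). Per the comment above,
the call must land between jpeg_start_compress() and the first
jpeg_write_scanlines(); a 100,000-byte profile would be split into two APP2
markers carrying 65519 and 34481 profile bytes (65533 minus 14 bytes of
overhead each):

    jpeg_start_compress(&cinfo, TRUE);
    jpeg_write_icc_profile(&cinfo, icc_buf, icc_len);
    /* ... per-scanline jpeg_write_scanlines() loop, then
       jpeg_finish_compress() ... */
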
diff --git a/media/libjpeg/jcinit.c b/media/libjpeg/jcinit.c
index 463bd8c6dd..157353a22e 100644
--- a/media/libjpeg/jcinit.c
+++ b/media/libjpeg/jcinit.c
@@ -1,8 +1,10 @@
/*
* jcinit.c
*
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -19,6 +21,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/*
@@ -28,13 +31,13 @@
*/
GLOBAL(void)
-jinit_compress_master (j_compress_ptr cinfo)
+jinit_compress_master(j_compress_ptr cinfo)
{
/* Initialize master control (includes parameter checking/processing) */
jinit_c_master_control(cinfo, FALSE /* full compression */);
/* Preprocessing */
- if (! cinfo->raw_data_in) {
+ if (!cinfo->raw_data_in) {
jinit_color_converter(cinfo);
jinit_downsampler(cinfo);
jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
@@ -60,14 +63,14 @@ jinit_compress_master (j_compress_ptr cinfo)
}
/* Need a full-image coefficient buffer in any multi-pass mode. */
- jinit_c_coef_controller(cinfo,
- (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
+ jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+ cinfo->optimize_coding));
jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
jinit_marker_writer(cinfo);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Write the datastream header (SOI) immediately.
* Frame and scan headers are postponed till later.
diff --git a/media/libjpeg/jcmainct.c b/media/libjpeg/jcmainct.c
index d01f46364b..3f23028c46 100644
--- a/media/libjpeg/jcmainct.c
+++ b/media/libjpeg/jcmainct.c
@@ -39,9 +39,10 @@ typedef my_main_controller *my_main_ptr;
/* Forward declarations */
-METHODDEF(void) process_data_simple_main
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail);
+METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
+ JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr,
+ JDIMENSION in_rows_avail);
/*
@@ -49,9 +50,9 @@ METHODDEF(void) process_data_simple_main
*/
METHODDEF(void)
-start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
/* Do nothing in raw-data mode. */
if (cinfo->raw_data_in)
@@ -75,19 +76,18 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-process_data_simple_main (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail)
+process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
/* Read input data if we haven't filled the main buffer yet */
if (main_ptr->rowgroup_ctr < DCTSIZE)
- (*cinfo->prep->pre_process_data) (cinfo,
- input_buf, in_row_ctr, in_rows_avail,
- main_ptr->buffer, &main_ptr->rowgroup_ctr,
- (JDIMENSION) DCTSIZE);
+ (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
+ in_rows_avail, main_ptr->buffer,
+ &main_ptr->rowgroup_ctr,
+ (JDIMENSION)DCTSIZE);
/* If we don't have a full iMCU row buffered, return to application for
* more data. Note that preprocessor will always pad to fill the iMCU row
@@ -97,14 +97,14 @@ process_data_simple_main (j_compress_ptr cinfo,
return;
/* Send the completed row to the compressor */
- if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+ if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
/* If compressor did not consume the whole row, then we must need to
* suspend processing and return to the application. In this situation
* we pretend we didn't yet consume the last input row; otherwise, if
* it happened to be the last row of the image, the application would
* think we were done.
*/
- if (! main_ptr->suspended) {
+ if (!main_ptr->suspended) {
(*in_row_ctr)--;
main_ptr->suspended = TRUE;
}
@@ -128,16 +128,16 @@ process_data_simple_main (j_compress_ptr cinfo,
*/
GLOBAL(void)
-jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_main_ptr main_ptr;
int ci;
jpeg_component_info *compptr;
main_ptr = (my_main_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_main_controller));
- cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
+ cinfo->main = (struct jpeg_c_main_controller *)main_ptr;
main_ptr->pub.start_pass = start_pass_main;
/* We don't need to create a buffer in raw-data mode. */
@@ -154,9 +154,9 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
compptr->width_in_blocks * DCTSIZE,
- (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+ (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
}
}
}
diff --git a/media/libjpeg/jcmarker.c b/media/libjpeg/jcmarker.c
index 463f665927..801fbab4ef 100644
--- a/media/libjpeg/jcmarker.c
+++ b/media/libjpeg/jcmarker.c
@@ -110,30 +110,30 @@ typedef my_marker_writer *my_marker_ptr;
*/
LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int val)
+emit_byte(j_compress_ptr cinfo, int val)
/* Emit a byte */
{
struct jpeg_destination_mgr *dest = cinfo->dest;
- *(dest->next_output_byte)++ = (JOCTET) val;
+ *(dest->next_output_byte)++ = (JOCTET)val;
if (--dest->free_in_buffer == 0) {
- if (! (*dest->empty_output_buffer) (cinfo))
+ if (!(*dest->empty_output_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
}
LOCAL(void)
-emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
+emit_marker(j_compress_ptr cinfo, JPEG_MARKER mark)
/* Emit a marker code */
{
emit_byte(cinfo, 0xFF);
- emit_byte(cinfo, (int) mark);
+ emit_byte(cinfo, (int)mark);
}
LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int value)
+emit_2bytes(j_compress_ptr cinfo, int value)
/* Emit a 2-byte integer; these are always MSB first in JPEG files */
{
emit_byte(cinfo, (value >> 8) & 0xFF);
@@ -146,7 +146,7 @@ emit_2bytes (j_compress_ptr cinfo, int value)
*/
LOCAL(int)
-emit_dqt (j_compress_ptr cinfo, int index)
+emit_dqt(j_compress_ptr cinfo, int index)
/* Emit a DQT marker */
/* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
{
@@ -163,19 +163,19 @@ emit_dqt (j_compress_ptr cinfo, int index)
prec = 1;
}
- if (! qtbl->sent_table) {
+ if (!qtbl->sent_table) {
emit_marker(cinfo, M_DQT);
- emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+ emit_2bytes(cinfo, prec ? DCTSIZE2 * 2 + 1 + 2 : DCTSIZE2 + 1 + 2);
- emit_byte(cinfo, index + (prec<<4));
+ emit_byte(cinfo, index + (prec << 4));
for (i = 0; i < DCTSIZE2; i++) {
/* The table entries must be emitted in zigzag order. */
unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
if (prec)
- emit_byte(cinfo, (int) (qval >> 8));
- emit_byte(cinfo, (int) (qval & 0xFF));
+ emit_byte(cinfo, (int)(qval >> 8));
+ emit_byte(cinfo, (int)(qval & 0xFF));
}
qtbl->sent_table = TRUE;
@@ -186,7 +186,7 @@ emit_dqt (j_compress_ptr cinfo, int index)
LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
+emit_dht(j_compress_ptr cinfo, int index, boolean is_ac)
/* Emit a DHT marker */
{
JHUFF_TBL *htbl;
@@ -202,7 +202,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
if (htbl == NULL)
ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
- if (! htbl->sent_table) {
+ if (!htbl->sent_table) {
emit_marker(cinfo, M_DHT);
length = 0;
@@ -224,7 +224,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
LOCAL(void)
-emit_dac (j_compress_ptr cinfo)
+emit_dac(j_compress_ptr cinfo)
/* Emit a DAC marker */
/* Since the useful info is so small, we want to emit all the tables in */
/* one DAC marker. Therefore this routine does its own scan of the table. */
@@ -255,12 +255,12 @@ emit_dac (j_compress_ptr cinfo)
if (length) {
emit_marker(cinfo, M_DAC);
- emit_2bytes(cinfo, length*2 + 2);
+ emit_2bytes(cinfo, length * 2 + 2);
for (i = 0; i < NUM_ARITH_TBLS; i++) {
if (dc_in_use[i]) {
emit_byte(cinfo, i);
- emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+ emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i] << 4));
}
if (ac_in_use[i]) {
emit_byte(cinfo, i + 0x10);
@@ -273,19 +273,19 @@ emit_dac (j_compress_ptr cinfo)
LOCAL(void)
-emit_dri (j_compress_ptr cinfo)
+emit_dri(j_compress_ptr cinfo)
/* Emit a DRI marker */
{
emit_marker(cinfo, M_DRI);
emit_2bytes(cinfo, 4); /* fixed length */
- emit_2bytes(cinfo, (int) cinfo->restart_interval);
+ emit_2bytes(cinfo, (int)cinfo->restart_interval);
}
LOCAL(void)
-emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
+emit_sof(j_compress_ptr cinfo, JPEG_MARKER code)
/* Emit a SOF marker */
{
int ci;
@@ -296,13 +296,12 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
/* Make sure image isn't bigger than SOF field can handle */
- if ((long) cinfo->_jpeg_height > 65535L ||
- (long) cinfo->_jpeg_width > 65535L)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
+ if ((long)cinfo->_jpeg_height > 65535L || (long)cinfo->_jpeg_width > 65535L)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)65535);
emit_byte(cinfo, cinfo->data_precision);
- emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
- emit_2bytes(cinfo, (int) cinfo->_jpeg_width);
+ emit_2bytes(cinfo, (int)cinfo->_jpeg_height);
+ emit_2bytes(cinfo, (int)cinfo->_jpeg_width);
emit_byte(cinfo, cinfo->num_components);
@@ -316,7 +315,7 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
LOCAL(void)
-emit_sos (j_compress_ptr cinfo)
+emit_sos(j_compress_ptr cinfo)
/* Emit a SOS marker */
{
int i, td, ta;
@@ -351,7 +350,7 @@ emit_sos (j_compress_ptr cinfo)
LOCAL(void)
-emit_jfif_app0 (j_compress_ptr cinfo)
+emit_jfif_app0(j_compress_ptr cinfo)
/* Emit a JFIF-compliant APP0 marker */
{
/*
@@ -378,15 +377,15 @@ emit_jfif_app0 (j_compress_ptr cinfo)
emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
emit_byte(cinfo, cinfo->JFIF_minor_version);
emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
- emit_2bytes(cinfo, (int) cinfo->X_density);
- emit_2bytes(cinfo, (int) cinfo->Y_density);
+ emit_2bytes(cinfo, (int)cinfo->X_density);
+ emit_2bytes(cinfo, (int)cinfo->Y_density);
emit_byte(cinfo, 0); /* No thumbnail image */
emit_byte(cinfo, 0);
}
LOCAL(void)
-emit_adobe_app14 (j_compress_ptr cinfo)
+emit_adobe_app14(j_compress_ptr cinfo)
/* Emit an Adobe APP14 marker */
{
/*
@@ -440,19 +439,19 @@ emit_adobe_app14 (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+write_marker_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
/* Emit an arbitrary marker header */
{
- if (datalen > (unsigned int) 65533) /* safety check */
+ if (datalen > (unsigned int)65533) /* safety check */
ERREXIT(cinfo, JERR_BAD_LENGTH);
- emit_marker(cinfo, (JPEG_MARKER) marker);
+ emit_marker(cinfo, (JPEG_MARKER)marker);
- emit_2bytes(cinfo, (int) (datalen + 2)); /* total length */
+ emit_2bytes(cinfo, (int)(datalen + 2)); /* total length */
}
METHODDEF(void)
-write_marker_byte (j_compress_ptr cinfo, int val)
+write_marker_byte(j_compress_ptr cinfo, int val)
/* Emit one byte of marker parameters following write_marker_header */
{
emit_byte(cinfo, val);
@@ -471,9 +470,9 @@ write_marker_byte (j_compress_ptr cinfo, int val)
*/
METHODDEF(void)
-write_file_header (j_compress_ptr cinfo)
+write_file_header(j_compress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
emit_marker(cinfo, M_SOI); /* first the SOI */
@@ -496,7 +495,7 @@ write_file_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_frame_header (j_compress_ptr cinfo)
+write_frame_header(j_compress_ptr cinfo)
{
int ci, prec;
boolean is_baseline;
@@ -556,9 +555,9 @@ write_frame_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_scan_header (j_compress_ptr cinfo)
+write_scan_header(j_compress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
int i;
jpeg_component_info *compptr;
@@ -600,7 +599,7 @@ write_scan_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_file_trailer (j_compress_ptr cinfo)
+write_file_trailer(j_compress_ptr cinfo)
{
emit_marker(cinfo, M_EOI);
}
@@ -614,7 +613,7 @@ write_file_trailer (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_tables_only (j_compress_ptr cinfo)
+write_tables_only(j_compress_ptr cinfo)
{
int i;
@@ -622,10 +621,10 @@ write_tables_only (j_compress_ptr cinfo)
for (i = 0; i < NUM_QUANT_TBLS; i++) {
if (cinfo->quant_tbl_ptrs[i] != NULL)
- (void) emit_dqt(cinfo, i);
+ (void)emit_dqt(cinfo, i);
}
- if (! cinfo->arith_code) {
+ if (!cinfo->arith_code) {
for (i = 0; i < NUM_HUFF_TBLS; i++) {
if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
emit_dht(cinfo, i, FALSE);
@@ -643,15 +642,15 @@ write_tables_only (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_marker_writer (j_compress_ptr cinfo)
+jinit_marker_writer(j_compress_ptr cinfo)
{
my_marker_ptr marker;
/* Create the subobject */
marker = (my_marker_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_marker_writer));
- cinfo->marker = (struct jpeg_marker_writer *) marker;
+ cinfo->marker = (struct jpeg_marker_writer *)marker;
/* Initialize method pointers */
marker->pub.write_file_header = write_file_header;
marker->pub.write_frame_header = write_frame_header;
diff --git a/media/libjpeg/jcmaster.c b/media/libjpeg/jcmaster.c
index 03a8b40ea9..c2b2600031 100644
--- a/media/libjpeg/jcmaster.c
+++ b/media/libjpeg/jcmaster.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,9 +25,9 @@
/* Private state */
typedef enum {
- main_pass, /* input data, also do first output step */
- huff_opt_pass, /* Huffman code optimization pass */
- output_pass /* data output pass */
+ main_pass, /* input data, also do first output step */
+ huff_opt_pass, /* Huffman code optimization pass */
+ output_pass /* data output pass */
} c_pass_type;
typedef struct {
@@ -66,7 +66,7 @@ typedef my_comp_master *my_master_ptr;
*/
GLOBAL(void)
-jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
/* Do computations that are needed before master selection phase */
{
/* Hardwire it to "no scaling" */
@@ -79,7 +79,7 @@ jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
LOCAL(void)
-initial_setup (j_compress_ptr cinfo, boolean transcode_only)
+initial_setup(j_compress_ptr cinfo, boolean transcode_only)
/* Do computations that are needed before master selection phase */
{
int ci;
@@ -95,19 +95,19 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
#endif
/* Sanity check on image dimensions */
- if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
- || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+ if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 ||
+ cinfo->num_components <= 0 || cinfo->input_components <= 0)
ERREXIT(cinfo, JERR_EMPTY_IMAGE);
/* Make sure image isn't bigger than I can handle */
- if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
- (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+ if ((long)cinfo->_jpeg_height > (long)JPEG_MAX_DIMENSION ||
+ (long)cinfo->_jpeg_width > (long)JPEG_MAX_DIMENSION)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
/* Width of an input scanline must be representable as JDIMENSION. */
- samplesperrow = (long) cinfo->image_width * (long) cinfo->input_components;
- jd_samplesperrow = (JDIMENSION) samplesperrow;
- if ((long) jd_samplesperrow != samplesperrow)
+ samplesperrow = (long)cinfo->image_width * (long)cinfo->input_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow;
+ if ((long)jd_samplesperrow != samplesperrow)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
/* For now, precision must match compiled-in value... */
@@ -124,8 +124,10 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
cinfo->max_v_samp_factor = 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
- compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+ if (compptr->h_samp_factor <= 0 ||
+ compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+ compptr->v_samp_factor <= 0 ||
+ compptr->v_samp_factor > MAX_SAMP_FACTOR)
ERREXIT(cinfo, JERR_BAD_SAMPLING);
cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
compptr->h_samp_factor);
@@ -146,18 +148,18 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
#endif
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->height_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
/* Size in samples */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
- (long) cinfo->max_h_samp_factor);
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)cinfo->max_h_samp_factor);
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
- (long) cinfo->max_v_samp_factor);
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)cinfo->max_v_samp_factor);
/* Mark component needed (this flag isn't actually used for compression) */
compptr->component_needed = TRUE;
}
@@ -166,15 +168,15 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
* main controller will call coefficient controller).
*/
cinfo->total_iMCU_rows = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
}
#ifdef C_MULTISCAN_FILES_SUPPORTED
LOCAL(void)
-validate_script (j_compress_ptr cinfo)
+validate_script(j_compress_ptr cinfo)
/* Verify that the scan script in cinfo->scan_info[] is valid; also
* determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
*/
@@ -196,10 +198,10 @@ validate_script (j_compress_ptr cinfo)
* for progressive JPEG, no scan can have this.
*/
scanptr = cinfo->scan_info;
- if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2-1) {
+ if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
#ifdef C_PROGRESSIVE_SUPPORTED
cinfo->progressive_mode = TRUE;
- last_bitpos_ptr = & last_bitpos[0][0];
+ last_bitpos_ptr = &last_bitpos[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (coefi = 0; coefi < DCTSIZE2; coefi++)
*last_bitpos_ptr++ = -1;
@@ -222,7 +224,7 @@ validate_script (j_compress_ptr cinfo)
if (thisi < 0 || thisi >= cinfo->num_components)
ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
/* Components must appear in SOF order within each scan */
- if (ci > 0 && thisi <= scanptr->component_index[ci-1])
+ if (ci > 0 && thisi <= scanptr->component_index[ci - 1])
ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
}
/* Validate progression parameters */
@@ -232,17 +234,17 @@ validate_script (j_compress_ptr cinfo)
Al = scanptr->Al;
if (cinfo->progressive_mode) {
#ifdef C_PROGRESSIVE_SUPPORTED
- /* The JPEG spec simply gives the ranges 0..13 for Ah and Al, but that
- * seems wrong: the upper bound ought to depend on data precision.
- * Perhaps they really meant 0..N+1 for N-bit precision.
+ /* Rec. ITU-T T.81 | ISO/IEC 10918-1 simply gives the ranges 0..13 for Ah
+ * and Al, but that seems wrong: the upper bound ought to depend on data
+ * precision. Perhaps they really meant 0..N+1 for N-bit precision.
* Here we allow 0..10 for 8-bit data; Al larger than 10 results in
* out-of-range reconstructed DC values during the first DC scan,
* which might cause problems for some decoders.
*/
#if BITS_IN_JSAMPLE == 8
-#define MAX_AH_AL 10
+#define MAX_AH_AL 10
#else
-#define MAX_AH_AL 13
+#define MAX_AH_AL 13
#endif
if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
@@ -255,7 +257,7 @@ validate_script (j_compress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
}
for (ci = 0; ci < ncomps; ci++) {
- last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+ last_bitpos_ptr = &last_bitpos[scanptr->component_index[ci]][0];
if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
for (coefi = Ss; coefi <= Se; coefi++) {
@@ -265,7 +267,7 @@ validate_script (j_compress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
} else {
/* not first scan */
- if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+ if (Ah != last_bitpos_ptr[coefi] || Al != Ah - 1)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
}
last_bitpos_ptr[coefi] = Al;
@@ -274,7 +276,7 @@ validate_script (j_compress_ptr cinfo)
#endif
} else {
/* For sequential JPEG, all progression parameters must be these: */
- if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
+ if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
/* Make sure components are not sent twice */
for (ci = 0; ci < ncomps; ci++) {
@@ -301,7 +303,7 @@ validate_script (j_compress_ptr cinfo)
#endif
} else {
for (ci = 0; ci < cinfo->num_components; ci++) {
- if (! component_sent[ci])
+ if (!component_sent[ci])
ERREXIT(cinfo, JERR_MISSING_DATA);
}
}
@@ -311,7 +313,7 @@ validate_script (j_compress_ptr cinfo)
LOCAL(void)
-select_scan_parameters (j_compress_ptr cinfo)
+select_scan_parameters(j_compress_ptr cinfo)
/* Set up the scan parameters for the current scan */
{
int ci;
@@ -319,7 +321,7 @@ select_scan_parameters (j_compress_ptr cinfo)
#ifdef C_MULTISCAN_FILES_SUPPORTED
if (cinfo->scan_info != NULL) {
/* Prepare for current scan --- the script is already validated */
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
cinfo->comps_in_scan = scanptr->comps_in_scan;
@@ -331,8 +333,7 @@ select_scan_parameters (j_compress_ptr cinfo)
cinfo->Se = scanptr->Se;
cinfo->Ah = scanptr->Ah;
cinfo->Al = scanptr->Al;
- }
- else
+ } else
#endif
{
/* Prepare for single sequential-JPEG scan containing all components */
@@ -344,7 +345,7 @@ select_scan_parameters (j_compress_ptr cinfo)
cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
}
cinfo->Ss = 0;
- cinfo->Se = DCTSIZE2-1;
+ cinfo->Se = DCTSIZE2 - 1;
cinfo->Ah = 0;
cinfo->Al = 0;
}
@@ -352,7 +353,7 @@ select_scan_parameters (j_compress_ptr cinfo)
LOCAL(void)
-per_scan_setup (j_compress_ptr cinfo)
+per_scan_setup(j_compress_ptr cinfo)
/* Do computations that are needed before processing a JPEG scan */
/* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */
{
@@ -377,7 +378,7 @@ per_scan_setup (j_compress_ptr cinfo)
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
*/
- tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (tmp == 0) tmp = compptr->v_samp_factor;
compptr->last_row_height = tmp;
@@ -394,11 +395,11 @@ per_scan_setup (j_compress_ptr cinfo)
/* Overall image size in MCUs */
cinfo->MCUs_per_row = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width,
- (long) (cinfo->max_h_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_width,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
cinfo->MCU_rows_in_scan = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
cinfo->blocks_in_MCU = 0;
@@ -410,10 +411,10 @@ per_scan_setup (j_compress_ptr cinfo)
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
/* Figure number of non-dummy blocks in last MCU column & row */
- tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+ tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
compptr->last_col_width = tmp;
- tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+ tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
if (tmp == 0) tmp = compptr->MCU_height;
compptr->last_row_height = tmp;
/* Prepare array describing MCU composition */
@@ -430,8 +431,8 @@ per_scan_setup (j_compress_ptr cinfo)
/* Convert restart specified in rows to actual MCU count. */
/* Note that count must fit in 16 bits, so we provide limiting. */
if (cinfo->restart_in_rows > 0) {
- long nominal = (long) cinfo->restart_in_rows * (long) cinfo->MCUs_per_row;
- cinfo->restart_interval = (unsigned int) MIN(nominal, 65535L);
+ long nominal = (long)cinfo->restart_in_rows * (long)cinfo->MCUs_per_row;
+ cinfo->restart_interval = (unsigned int)MIN(nominal, 65535L);
}
}
@@ -445,9 +446,9 @@ per_scan_setup (j_compress_ptr cinfo)
*/
METHODDEF(void)
-prepare_for_pass (j_compress_ptr cinfo)
+prepare_for_pass(j_compress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
switch (master->pass_type) {
case main_pass:
@@ -456,7 +457,7 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
select_scan_parameters(cinfo);
per_scan_setup(cinfo);
- if (! cinfo->raw_data_in) {
+ if (!cinfo->raw_data_in) {
(*cinfo->cconvert->start_pass) (cinfo);
(*cinfo->downsample->start_pass) (cinfo);
(*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
@@ -491,12 +492,12 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
master->pass_type = output_pass;
master->pass_number++;
- /*FALLTHROUGH*/
#endif
+ FALLTHROUGH /*FALLTHROUGH*/
case output_pass:
/* Do a data-output pass. */
/* We need not repeat per-scan setup if prior optimization pass did it. */
- if (! cinfo->optimize_coding) {
+ if (!cinfo->optimize_coding) {
select_scan_parameters(cinfo);
per_scan_setup(cinfo);
}
@@ -512,7 +513,7 @@ prepare_for_pass (j_compress_ptr cinfo)
ERREXIT(cinfo, JERR_NOT_COMPILED);
}
- master->pub.is_last_pass = (master->pass_number == master->total_passes-1);
+ master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
/* Set up progress monitor's pass info if present */
if (cinfo->progress != NULL) {
@@ -533,7 +534,7 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
METHODDEF(void)
-pass_startup (j_compress_ptr cinfo)
+pass_startup(j_compress_ptr cinfo)
{
cinfo->master->call_pass_startup = FALSE; /* reset flag so call only once */
@@ -547,9 +548,9 @@ pass_startup (j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass_master (j_compress_ptr cinfo)
+finish_pass_master(j_compress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
/* The entropy coder always needs an end-of-pass call,
* either to analyze statistics or to flush its output buffer.
@@ -563,7 +564,7 @@ finish_pass_master (j_compress_ptr cinfo)
* or output of scan 1 (if no optimization).
*/
master->pass_type = output_pass;
- if (! cinfo->optimize_coding)
+ if (!cinfo->optimize_coding)
master->scan_number++;
break;
case huff_opt_pass:
@@ -587,14 +588,14 @@ finish_pass_master (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
+jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
{
my_master_ptr master;
master = (my_master_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- sizeof(my_comp_master));
- cinfo->master = (struct jpeg_comp_master *) master;
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_comp_master));
+ cinfo->master = (struct jpeg_comp_master *)master;
master->pub.prepare_for_pass = prepare_for_pass;
master->pub.pass_startup = pass_startup;
master->pub.finish_pass = finish_pass_master;
diff --git a/media/libjpeg/jcomapi.c b/media/libjpeg/jcomapi.c
index 6e5bf3dba9..efbb8357b0 100644
--- a/media/libjpeg/jcomapi.c
+++ b/media/libjpeg/jcomapi.c
@@ -29,7 +29,7 @@
*/
GLOBAL(void)
-jpeg_abort (j_common_ptr cinfo)
+jpeg_abort(j_common_ptr cinfo)
{
int pool;
@@ -40,7 +40,7 @@ jpeg_abort (j_common_ptr cinfo)
/* Releasing pools in reverse order might help avoid fragmentation
* with some (brain-damaged) malloc libraries.
*/
- for (pool = JPOOL_NUMPOOLS-1; pool > JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool > JPOOL_PERMANENT; pool--) {
(*cinfo->mem->free_pool) (cinfo, pool);
}
@@ -50,7 +50,7 @@ jpeg_abort (j_common_ptr cinfo)
/* Try to keep application from accessing now-deleted marker list.
* A bit kludgy to do it here, but this is the most central place.
*/
- ((j_decompress_ptr) cinfo)->marker_list = NULL;
+ ((j_decompress_ptr)cinfo)->marker_list = NULL;
} else {
cinfo->global_state = CSTATE_START;
}
@@ -69,7 +69,7 @@ jpeg_abort (j_common_ptr cinfo)
*/
GLOBAL(void)
-jpeg_destroy (j_common_ptr cinfo)
+jpeg_destroy(j_common_ptr cinfo)
{
/* We need only tell the memory manager to release everything. */
/* NB: mem pointer is NULL if memory mgr failed to initialize. */
@@ -86,7 +86,7 @@ jpeg_destroy (j_common_ptr cinfo)
*/
GLOBAL(JQUANT_TBL *)
-jpeg_alloc_quant_table (j_common_ptr cinfo)
+jpeg_alloc_quant_table(j_common_ptr cinfo)
{
JQUANT_TBL *tbl;
@@ -98,7 +98,7 @@ jpeg_alloc_quant_table (j_common_ptr cinfo)
GLOBAL(JHUFF_TBL *)
-jpeg_alloc_huff_table (j_common_ptr cinfo)
+jpeg_alloc_huff_table(j_common_ptr cinfo)
{
JHUFF_TBL *tbl;
diff --git a/media/libjpeg/jconfigint.h b/media/libjpeg/jconfigint.h
index 1289399f94..02be97857f 100644
--- a/media/libjpeg/jconfigint.h
+++ b/media/libjpeg/jconfigint.h
@@ -1,7 +1,47 @@
-#define VERSION "1.5.1"
-#define BUILD "2016-09-20"
-#define PACKAGE_NAME "libjpeg-turbo"
+/* libjpeg-turbo build number */
+#define BUILD "20220624"
/* Need to use Mozilla-specific function inlining. */
#include "mozilla/Attributes.h"
#define INLINE MOZ_ALWAYS_INLINE
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Version number of package */
+#define VERSION "2.1.3"
+
+/* The size of `size_t', as computed by sizeof. */
+#ifdef HAVE_64BIT_BUILD
+#define SIZEOF_SIZE_T 8
+#else
+#define SIZEOF_SIZE_T 4
+#endif
+
+/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
+#ifndef _MSC_VER
+#define HAVE_BUILTIN_CTZL 1
+#endif
+
+/* Define to 1 if you have the <intrin.h> header file. */
+#ifdef _MSC_VER
+#define HAVE_INTRIN_H 1
+#endif
+
+#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+#if (SIZEOF_SIZE_T == 8)
+#define HAVE_BITSCANFORWARD64
+#elif (SIZEOF_SIZE_T == 4)
+#define HAVE_BITSCANFORWARD
+#endif
+#endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/media/libjpeg/jcparam.c b/media/libjpeg/jcparam.c
index 18b2d487ae..5bc7174dcb 100644
--- a/media/libjpeg/jcparam.c
+++ b/media/libjpeg/jcparam.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1998, Thomas G. Lane.
* Modified 2003-2008 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,9 +25,9 @@
*/
GLOBAL(void)
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
- const unsigned int *basic_table,
- int scale_factor, boolean force_baseline)
+jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+ const unsigned int *basic_table, int scale_factor,
+ boolean force_baseline)
/* Define a quantization table equal to the basic_table times
* a scale factor (given as a percentage).
* If force_baseline is TRUE, the computed quantization table entries
@@ -45,19 +45,19 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS)
ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl);
- qtblptr = & cinfo->quant_tbl_ptrs[which_tbl];
+ qtblptr = &cinfo->quant_tbl_ptrs[which_tbl];
if (*qtblptr == NULL)
- *qtblptr = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+ *qtblptr = jpeg_alloc_quant_table((j_common_ptr)cinfo);
for (i = 0; i < DCTSIZE2; i++) {
- temp = ((long) basic_table[i] * scale_factor + 50L) / 100L;
+ temp = ((long)basic_table[i] * scale_factor + 50L) / 100L;
/* limit the values to the valid range */
if (temp <= 0L) temp = 1L;
if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */
if (force_baseline && temp > 255L)
temp = 255L; /* limit to baseline range if requested */
- (*qtblptr)->quantval[i] = (UINT16) temp;
+ (*qtblptr)->quantval[i] = (UINT16)temp;
}
/* Initialize sent_table FALSE so table will be written to JPEG file. */
@@ -65,7 +65,8 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
}
-/* These are the sample quantization tables given in JPEG spec section K.1.
+/* These are the sample quantization tables given in Annex K (Clause K.1) of
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
* The spec says that the values given produce "good" quality, and
* when divided by 2, "very good" quality.
*/
@@ -93,7 +94,7 @@ static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
#if JPEG_LIB_VERSION >= 70
GLOBAL(void)
-jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables
* and straight percentage-scaling quality scales.
* This entry point allows different scalings for luminance and chrominance.
@@ -109,8 +110,8 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
GLOBAL(void)
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
- boolean force_baseline)
+jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+ boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables
* and a straight percentage-scaling quality scale. In most cases it's better
* to use jpeg_set_quality (below); this entry point is provided for
@@ -126,7 +127,7 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
GLOBAL(int)
-jpeg_quality_scaling (int quality)
+jpeg_quality_scaling(int quality)
/* Convert a user-specified quality rating to a percentage scaling factor
* for an underlying quantization table, using our recommended scaling curve.
* The input 'quality' factor should be 0 (terrible) to 100 (very good).
@@ -145,14 +146,14 @@ jpeg_quality_scaling (int quality)
if (quality < 50)
quality = 5000 / quality;
else
- quality = 200 - quality*2;
+ quality = 200 - quality * 2;
return quality;
}
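
Worked examples of the quality-to-scaling curve above (the result is the
percentage applied to the Annex K base tables):

    quality  25  ->  5000 / 25      == 200   (tables doubled)
    quality  50  ->  200 - 50 * 2   == 100   (the base tables as-is)
    quality  75  ->  200 - 75 * 2   ==  50   (tables halved: "very good")
    quality 100  ->  200 - 100 * 2  ==   0   (every entry then clamps to 1
                                              in jpeg_add_quant_table)
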
GLOBAL(void)
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+jpeg_set_quality(j_compress_ptr cinfo, int quality, boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables.
* This is the standard quality-adjusting entry point for typical user
* interfaces; only those who want detailed control over quantization tables
@@ -178,7 +179,7 @@ jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
*/
GLOBAL(void)
-jpeg_set_defaults (j_compress_ptr cinfo)
+jpeg_set_defaults(j_compress_ptr cinfo)
{
int i;
@@ -192,7 +193,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
*/
if (cinfo->comp_info == NULL)
cinfo->comp_info = (jpeg_component_info *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
MAX_COMPONENTS * sizeof(jpeg_component_info));
/* Initialize everything not dependent on the color space */
@@ -205,7 +206,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
/* Set up two quantization tables using default quality of 75 */
jpeg_set_quality(cinfo, 75, TRUE);
/* Set up two Huffman tables */
- std_huff_tables((j_common_ptr) cinfo);
+ std_huff_tables((j_common_ptr)cinfo);
/* Initialize default arithmetic coding conditioning */
for (i = 0; i < NUM_ARITH_TBLS; i++) {
@@ -278,7 +279,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_default_colorspace (j_compress_ptr cinfo)
+jpeg_default_colorspace(j_compress_ptr cinfo)
{
switch (cinfo->in_color_space) {
case JCS_GRAYSCALE:
@@ -320,12 +321,12 @@ jpeg_default_colorspace (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
{
jpeg_component_info *compptr;
int ci;
-#define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl) \
+#define SET_COMP(index, id, hsamp, vsamp, quant, dctbl, actbl) \
(compptr = &cinfo->comp_info[index], \
compptr->component_id = (id), \
compptr->h_samp_factor = (hsamp), \
@@ -352,39 +353,39 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
cinfo->num_components = 1;
/* JFIF specifies component ID 1 */
- SET_COMP(0, 1, 1,1, 0, 0,0);
+ SET_COMP(0, 1, 1, 1, 0, 0, 0);
break;
case JCS_RGB:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
cinfo->num_components = 3;
- SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0);
- SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
- SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0);
+ SET_COMP(0, 0x52 /* 'R' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x47 /* 'G' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x42 /* 'B' */, 1, 1, 0, 0, 0);
break;
case JCS_YCbCr:
cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
cinfo->num_components = 3;
/* JFIF specifies component IDs 1,2,3 */
/* We default to 2x2 subsamples of chrominance */
- SET_COMP(0, 1, 2,2, 0, 0,0);
- SET_COMP(1, 2, 1,1, 1, 1,1);
- SET_COMP(2, 3, 1,1, 1, 1,1);
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
break;
case JCS_CMYK:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */
cinfo->num_components = 4;
- SET_COMP(0, 0x43 /* 'C' */, 1,1, 0, 0,0);
- SET_COMP(1, 0x4D /* 'M' */, 1,1, 0, 0,0);
- SET_COMP(2, 0x59 /* 'Y' */, 1,1, 0, 0,0);
- SET_COMP(3, 0x4B /* 'K' */, 1,1, 0, 0,0);
+ SET_COMP(0, 0x43 /* 'C' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x4D /* 'M' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x59 /* 'Y' */, 1, 1, 0, 0, 0);
+ SET_COMP(3, 0x4B /* 'K' */, 1, 1, 0, 0, 0);
break;
case JCS_YCCK:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */
cinfo->num_components = 4;
- SET_COMP(0, 1, 2,2, 0, 0,0);
- SET_COMP(1, 2, 1,1, 1, 1,1);
- SET_COMP(2, 3, 1,1, 1, 1,1);
- SET_COMP(3, 4, 2,2, 0, 0,0);
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
+ SET_COMP(3, 4, 2, 2, 0, 0, 0);
break;
case JCS_UNKNOWN:
cinfo->num_components = cinfo->input_components;
@@ -392,7 +393,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
MAX_COMPONENTS);
for (ci = 0; ci < cinfo->num_components; ci++) {
- SET_COMP(ci, ci, 1,1, 0, 0,0);
+ SET_COMP(ci, ci, 1, 1, 0, 0, 0);
}
break;
default:
@@ -404,8 +405,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
#ifdef C_PROGRESSIVE_SUPPORTED
LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info *scanptr, int ci,
- int Ss, int Se, int Ah, int Al)
+fill_a_scan(jpeg_scan_info *scanptr, int ci, int Ss, int Se, int Ah, int Al)
/* Support routine: generate one scan for specified component */
{
scanptr->comps_in_scan = 1;
@@ -419,8 +419,7 @@ fill_a_scan (jpeg_scan_info *scanptr, int ci,
}
LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info *scanptr, int ncomps,
- int Ss, int Se, int Ah, int Al)
+fill_scans(jpeg_scan_info *scanptr, int ncomps, int Ss, int Se, int Ah, int Al)
/* Support routine: generate one scan for each component */
{
int ci;
@@ -438,7 +437,7 @@ fill_scans (jpeg_scan_info *scanptr, int ncomps,
}
LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans(jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
/* Support routine: generate interleaved DC scan if possible, else N scans */
{
int ci;
@@ -466,7 +465,7 @@ fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
*/
GLOBAL(void)
-jpeg_simple_progression (j_compress_ptr cinfo)
+jpeg_simple_progression(j_compress_ptr cinfo)
{
int ncomps = cinfo->num_components;
int nscans;
@@ -498,7 +497,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) {
cinfo->script_space_size = MAX(nscans, 10);
cinfo->script_space = (jpeg_scan_info *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
cinfo->script_space_size * sizeof(jpeg_scan_info));
}
scanptr = cinfo->script_space;
diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c
index 046e2e18d4..872e570bff 100644
--- a/media/libjpeg/jcphuff.c
+++ b/media/libjpeg/jcphuff.c
@@ -4,7 +4,10 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
+ * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2021, Alex Richardson.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -18,15 +21,74 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jchuff.h" /* Declarations shared with jchuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
+#include <limits.h>
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
#ifdef C_PROGRESSIVE_SUPPORTED
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
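
Both definitions above agree on the contract of JPEG_NBITS(): the number of bits needed to represent a nonnegative value. A minimal standalone sketch of that contract against a portable loop (illustrative only, not part of the patch):

    #include <assert.h>

    /* Portable equivalent of JPEG_NBITS: bit length of x (0 for x == 0). */
    static int nbits_portable(unsigned int x)
    {
      int n = 0;
      while (x) { n++; x >>= 1; }
      return n;
    }

    int main(void)
    {
      /* With the clz form: 32 - __builtin_clz(5) == 32 - 29 == 3 */
      assert(nbits_portable(5) == 3);
      assert(nbits_portable(0) == 0);   /* the "x ? ... : 0" guard above */
      return 0;
    }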
+
+
/* Expanded entropy encoder object for progressive Huffman encoding. */
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
+ /* Pointer to routine to prepare data for encode_mcu_AC_first() */
+ void (*AC_first_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits);
+ /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
+ int (*AC_refine_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits);
+
/* Mode flag: TRUE for optimization, FALSE for actual data output */
boolean gather_statistics;
@@ -79,26 +141,62 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS int ishift_temp;
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
+
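The new PAD() macro rounds v up to the next multiple of the power-of-two p; the AC encoders below use it to 16-byte-align their local coefficient buffers for SIMD loads. A quick sketch of its behavior (illustrative only):

    #include <assert.h>
    #define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))

    int main(void)
    {
      assert(PAD(13, 16) == 16);
      assert(PAD(16, 16) == 16);   /* already-aligned values are unchanged */
      assert(PAD(17, 16) == 32);   /* otherwise round up to the next boundary */
      return 0;
    }
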
/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+METHODDEF(int) encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
-METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+
+
+/* Count the trailing zero bits of *x and shift them out */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+ int result;
+ result = __builtin_ctzl(*x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD64)
+ unsigned long result;
+ _BitScanForward64(&result, *x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD)
+ unsigned long result;
+ _BitScanForward(&result, *x);
+ *x >>= result;
+#else
+ int result = 0;
+ while ((*x & 1) == 0) {
+ ++result;
+ *x >>= 1;
+ }
+#endif
+ return (int)result;
+}
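
count_zeroes() both returns the number of trailing zero bits and shifts them out of *x. The ENCODE_COEFS_* macros below use it to jump directly from one set bit of the zerobits bitmap to the next, i.e. from one nonzero coefficient to the next. A sketch of that access pattern using the portable fallback branch (illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    static int count_zeroes_portable(size_t *x)   /* the #else branch above */
    {
      int result = 0;
      while ((*x & 1) == 0) { result++; *x >>= 1; }
      return result;
    }

    int main(void)
    {
      size_t zerobits = 0x105;   /* bits 0, 2, and 8 set */
      int k = 0;
      while (zerobits) {
        k += count_zeroes_portable(&zerobits);
        printf("nonzero coefficient at k = %d\n", k);   /* prints 0, 2, 8 */
        zerobits >>= 1;
        k++;
      }
      return 0;
    }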
/*
@@ -106,9 +204,9 @@ METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
*/
METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -126,15 +224,23 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
+ if (jsimd_can_encode_mcu_AC_first_prepare())
+ entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+ else
+ entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
else {
entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ if (jsimd_can_encode_mcu_AC_refine_prepare())
+ entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+ else
+ entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
/* AC refinement needs a correction bit buffer */
if (entropy->bit_buffer == NULL)
entropy->bit_buffer = (char *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
MAX_CORR_BITS * sizeof(char));
}
}
@@ -167,14 +273,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
if (entropy->count_ptrs[tbl] == NULL)
entropy->count_ptrs[tbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
+ memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
} else {
/* Compute derived values for Huffman table */
/* We may do this more than once for a table, but it's not expensive */
jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
- & entropy->derived_tbls[tbl]);
+ &entropy->derived_tbls[tbl]);
}
}
@@ -198,19 +304,20 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
*/
/* Emit a byte */
-#define emit_byte(entropy,val) \
- { *(entropy)->next_output_byte++ = (JOCTET) (val); \
- if (--(entropy)->free_in_buffer == 0) \
- dump_buffer(entropy); }
+#define emit_byte(entropy, val) { \
+ *(entropy)->next_output_byte++ = (JOCTET)(val); \
+ if (--(entropy)->free_in_buffer == 0) \
+ dump_buffer(entropy); \
+}
LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
+dump_buffer(phuff_entropy_ptr entropy)
/* Empty the output buffer; we do not support suspension in this module. */
{
struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
- if (! (*dest->empty_output_buffer) (entropy->cinfo))
+ if (!(*dest->empty_output_buffer) (entropy->cinfo))
ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
/* After a successful buffer dump, must reset buffer pointers */
entropy->next_output_byte = dest->next_output_byte;
@@ -227,11 +334,11 @@ dump_buffer (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
/* Emit some bits, unless we are in gather mode */
{
/* This routine is heavily used, so it's worth coding tightly. */
- register size_t put_buffer = (size_t) code;
+ register size_t put_buffer = (size_t)code;
register int put_bits = entropy->put_bits;
/* if size is 0, caller used an invalid Huffman table entry */
@@ -241,7 +348,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
if (entropy->gather_statistics)
return; /* do nothing if we're only getting stats */
- put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+ put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
put_bits += size; /* new number of bits in buffer */
@@ -250,7 +357,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
while (put_bits >= 8) {
- int c = (int) ((put_buffer >> 16) & 0xFF);
+ int c = (int)((put_buffer >> 16) & 0xFF);
emit_byte(entropy, c);
if (c == 0xFF) { /* need to stuff a zero byte? */
@@ -266,7 +373,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
+flush_bits(phuff_entropy_ptr entropy)
{
emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
entropy->put_buffer = 0; /* and reset bit-buffer to empty */
@@ -279,7 +386,7 @@ flush_bits (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
{
if (entropy->gather_statistics)
entropy->count_ptrs[tbl_no][symbol]++;
@@ -295,14 +402,14 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
*/
LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
- unsigned int nbits)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+ unsigned int nbits)
{
if (entropy->gather_statistics)
return; /* no real work */
while (nbits > 0) {
- emit_bits(entropy, (unsigned int) (*bufstart), 1);
+ emit_bits(entropy, (unsigned int)(*bufstart), 1);
bufstart++;
nbits--;
}
@@ -314,15 +421,13 @@ emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
*/
LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
+emit_eobrun(phuff_entropy_ptr entropy)
{
register int temp, nbits;
if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */
temp = entropy->EOBRUN;
- nbits = 0;
- while ((temp >>= 1))
- nbits++;
+ nbits = JPEG_NBITS_NONZERO(temp) - 1;
/* safety check: shouldn't happen given limited correction-bit buffer */
if (nbits > 14)
ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
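
The replaced loop and the new JPEG_NBITS_NONZERO(temp) - 1 expression both compute floor(log2(EOBRUN)), i.e. the EOBn category whose extension bits encode the run length. A sketch of the equivalence (illustrative only):

    #include <assert.h>

    static int nbits_loop(int temp)   /* the deleted form */
    {
      int nbits = 0;
      while ((temp >>= 1))
        nbits++;
      return nbits;
    }

    int main(void)
    {
      /* JPEG_NBITS_NONZERO(5) - 1 == 3 - 1 == 2, matching the loop: */
      assert(nbits_loop(5) == 2);
      assert(nbits_loop(1) == 0);    /* EOB run of 1 -> EOB0, no extra bits */
      assert(nbits_loop(16) == 4);   /* run of 16 -> EOB4, 4 extension bits */
      return 0;
    }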
@@ -345,13 +450,13 @@ emit_eobrun (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
{
int ci;
emit_eobrun(entropy);
- if (! entropy->gather_statistics) {
+ if (!entropy->gather_statistics) {
flush_bits(entropy);
emit_byte(entropy, 0xFF);
emit_byte(entropy, JPEG_RST0 + restart_num);
@@ -375,10 +480,10 @@ emit_restart (phuff_entropy_ptr entropy, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp, temp2;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, temp2, temp3;
register int nbits;
int blkn, ci;
int Al = cinfo->Al;
@@ -403,31 +508,31 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Compute the DC value after the required point transform by Al.
* This is simply an arithmetic right shift.
*/
- temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+ temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
/* DC differences are figured on the point-transformed values. */
temp = temp2 - entropy->last_dc_val[ci];
entropy->last_dc_val[ci] = temp2;
/* Encode the DC coefficient difference per section G.1.2.1 */
- temp2 = temp;
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2--;
- }
+
+ /* This is a well-known technique for obtaining the absolute value without
+ * a branch. It is derived from an assembly language technique presented
+ * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+ * 1997 by Agner Fog.
+ */
+ temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp ^= temp3;
+ temp -= temp3; /* temp is abs value of input */
+ /* For a negative input, want temp2 = bitwise complement of abs(input) */
+ temp2 = temp ^ temp3;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 0;
- while (temp) {
- nbits++;
- temp >>= 1;
- }
+ nbits = JPEG_NBITS(temp);
/* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much.
*/
- if (nbits > MAX_COEF_BITS+1)
+ if (nbits > MAX_COEF_BITS + 1)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count/emit the Huffman-coded symbol for the number of bits */
@@ -436,7 +541,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
if (nbits) /* emit_bits rejects calls with size 0 */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ emit_bits(entropy, (unsigned int)temp2, nbits);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
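
The branchless sequence above relies on two's complement arithmetic, as the deleted code already did: shifting temp right by the width minus one yields 0 for nonnegative inputs and all-ones for negative ones, so XOR-and-subtract produces abs(temp) and a final XOR produces its bitwise complement for negative inputs. A standalone sketch, assuming an arithmetic right shift of negative ints (illustrative only):

    #include <assert.h>
    #include <limits.h>

    int main(void)
    {
      int temp = -5, temp2, temp3;
      temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);   /* -1, since temp < 0 */
      temp ^= temp3;
      temp -= temp3;          /* temp == 5, the absolute value */
      temp2 = temp ^ temp3;   /* temp2 == ~5, complement of abs(input) */
      assert(temp == 5 && temp2 == ~5);
      return 0;
    }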
@@ -457,20 +562,115 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_first().
+ */
+
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ if (temp == 0) \
+ continue; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value; so the code is \
+ * interwoven with finding the abs value (temp) and output bits (temp2). \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ /* Watch out for case that nonzero coef is zero after point transform */ \
+ if (temp == 0) \
+ continue; \
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+ temp2 ^= temp; \
+ values[k] = (JCOEF)temp; \
+ values[k + DCTSIZE2] = (JCOEF)temp2; \
+ zerobits |= ((size_t)1U) << k; \
+ } \
+}
+
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *bits)
+{
+ register int k, temp, temp2;
+ size_t zerobits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+ zerobits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ values += 32;
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl);
+ }
+ bits[1] = zerobits;
+#endif
+}
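
The prepare step packs its results for the encoding loop: values[k] holds the post-transform magnitude, values[k + DCTSIZE2] holds the bit pattern to emit (the magnitude for a positive coefficient, its complement for a negative one), and each nonzero position sets a bit in zerobits. On 32-bit builds the 64 positions are split across bits[0] (k = 0..31) and bits[1] (k = 32..63). A sketch of the values[] convention (illustrative only):

    #include <stdio.h>
    #define DCTSIZE2 64

    int main(void)
    {
      short values[2 * DCTSIZE2] = { 0 };
      /* Suppose coefficient k = 3 is -7 after the point transform: */
      int k = 3, temp = 7, temp2 = ~7;
      values[k] = (short)temp;               /* magnitude */
      values[k + DCTSIZE2] = (short)temp2;   /* bits emitted for a negative coef */
      printf("%d %d\n", values[3], values[3 + DCTSIZE2]);   /* prints: 7 -8 */
      return 0;
    }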
+
+/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
+#define ENCODE_COEFS_AC_FIRST(label) { \
+ while (zerobits) { \
+ r = count_zeroes(&zerobits); \
+ cvalue += r; \
+label \
+ temp = cvalue[0]; \
+ temp2 = cvalue[DCTSIZE2]; \
+ \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ } \
+ \
+ /* Find the number of bits needed for the magnitude of the coefficient */ \
+ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
+ /* Check for out-of-range coefficient values */ \
+ if (nbits > MAX_COEF_BITS) \
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+ \
+ /* Emit that number of bits of the value, if positive, */ \
+ /* or the complement of its magnitude, if negative. */ \
+ emit_bits(entropy, (unsigned int)temp2, nbits); \
+ \
+ cvalue++; \
+ zerobits >>= 1; \
+ } \
+}
+
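As in the baseline encoder, each coded AC coefficient is a Huffman symbol packing (run length << 4) | nbits, and runs longer than 15 zeroes are split off as ZRL (0xF0) symbols first. A sketch of the packing (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int r = 6, nbits = 3, zrl = 0;   /* 6 zeroes, then a magnitude-5 coef */
      assert(((r << 4) + nbits) == 0x63);

      r = 20;                          /* needs one ZRL, leaving r = 4 */
      while (r > 15) { zrl++; r -= 16; }
      assert(zrl == 1 && r == 4);
      return 0;
    }
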
METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp2;
- register int nbits;
- register int r, k;
- int Se = cinfo->Se;
+ register int nbits, r;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
+ JCOEF values_unaligned[2 * DCTSIZE2 + 15];
+ JCOEF *values;
+ const JCOEF *cvalue;
+ size_t zerobits;
+ size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -480,66 +680,48 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
-
- /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+#ifdef WITH_SIMD
+ cvalue = values = (JCOEF *)PAD((JUINTPTR)values_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cvalue = values = values_unaligned;
+#endif
- r = 0; /* r = run length of zeros */
+ /* Prepare data */
+ entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, values, bits);
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
- r++;
- continue;
- }
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value; so the code is
- * interwoven with finding the abs value (temp) and output bits (temp2).
- */
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
- temp2 = ~temp;
- } else {
- temp >>= Al; /* apply the point transform */
- temp2 = temp;
- }
- /* Watch out for case that nonzero coef is zero after point transform */
- if (temp == 0) {
- r++;
- continue;
- }
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+ zerobits |= bits[1];
+#endif
- /* Emit any pending EOBRUN */
- if (entropy->EOBRUN > 0)
- emit_eobrun(entropy);
- /* if run length > 15, must emit special run-length-16 codes (0xF0) */
- while (r > 15) {
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- }
+ /* Emit any pending EOBRUN */
+ if (zerobits && (entropy->EOBRUN > 0))
+ emit_eobrun(entropy);
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 1; /* there must be at least one 1 bit */
- while ((temp >>= 1))
- nbits++;
- /* Check for out-of-range coefficient values */
- if (nbits > MAX_COEF_BITS)
- ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[0];
+#endif
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ ENCODE_COEFS_AC_FIRST((void)0;);
- r = 0; /* reset zero run length */
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ if (zerobits) {
+ int diff = ((values + DCTSIZE2 / 2) - cvalue);
+ r = count_zeroes(&zerobits);
+ r += diff;
+ cvalue += r;
+ goto first_iter_ac_first;
}
- if (r > 0) { /* If there are trailing zeroes, */
+ ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+ if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */
@@ -569,9 +751,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp;
int blkn;
int Al = cinfo->Al;
@@ -591,7 +773,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* We simply emit the Al'th bit of the DC coefficient value. */
temp = (*block)[0];
- emit_bits(entropy, (unsigned int) (temp >> Al), 1);
+ emit_bits(entropy, (unsigned int)(temp >> Al), 1);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -612,22 +794,148 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_refine().
+ */
+
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+ /* It is convenient to make a pre-pass to determine the transformed \
+ * coefficients' absolute values and the EOB position. \
+ */ \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value. \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ if (temp != 0) { \
+ zerobits |= ((size_t)1U) << k; \
+ signbits |= ((size_t)(temp2 + 1)) << k; \
+ } \
+ absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+ if (temp == 1) \
+ EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
+ } \
+}
+
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ register int k, temp, temp2;
+ int EOB = 0;
+ size_t zerobits = 0U, signbits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+ bits[1] = signbits;
+#else
+ bits[2] = signbits;
+
+ zerobits = 0U;
+ signbits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ absvalues += 32;
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+ }
+
+ bits[1] = zerobits;
+ bits[3] = signbits;
+#endif
+
+ return EOB;
+}
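
The returned bitmaps are laid out as bits[0] = zerobits and bits[1] = signbits on 64-bit builds; on 32-bit builds, bits[0]/bits[1] hold zerobits for k = 0..31 and 32..63, and bits[2]/bits[3] hold signbits likewise. The (temp2 + 1) term stores 0 for a negative coefficient and 1 otherwise, so the refinement loop can emit signbits & 1 directly. A sketch of that sign convention (illustrative only):

    #include <assert.h>
    #include <limits.h>

    int main(void)
    {
      int coef = -9;
      int temp2 = coef >> (CHAR_BIT * sizeof(int) - 1);   /* -1: negative */
      assert(temp2 + 1 == 0);   /* negative coef -> sign bit 0 */

      coef = 9;
      temp2 = coef >> (CHAR_BIT * sizeof(int) - 1);       /* 0: nonnegative */
      assert(temp2 + 1 == 1);   /* nonnegative coef -> sign bit 1 */
      return 0;
    }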
+
+
+/*
* MCU encoding for AC successive approximation refinement scan.
*/
+#define ENCODE_COEFS_AC_REFINE(label) { \
+ while (zerobits) { \
+ idx = count_zeroes(&zerobits); \
+ r += idx; \
+ cabsvalue += idx; \
+ signbits >>= idx; \
+label \
+ /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+ while (r > 15 && (cabsvalue <= EOBPTR)) { \
+ /* emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ /* Emit ZRL */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ /* Emit buffered correction bits that must be associated with ZRL */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ } \
+ \
+ temp = *cabsvalue++; \
+ \
+ /* If the coef was previously nonzero, it only needs a correction bit. \
+ * NOTE: a straight translation of the spec's figure G.7 would suggest \
+ * that we also need to test r > 15. But if r > 15, we can only get here \
+ * if k > EOB, which implies that this coefficient is not 1. \
+ */ \
+ if (temp > 1) { \
+ /* The correction bit is the next bit of the absolute value. */ \
+ BR_buffer[BR++] = (char)(temp & 1); \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ continue; \
+ } \
+ \
+ /* Emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+ \
+ /* Emit output bit for newly-nonzero coef */ \
+ temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+ emit_bits(entropy, (unsigned int)temp, 1); \
+ \
+ /* Emit buffered correction bits that must be associated with this code */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ r = 0; /* reset zero run length */ \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ } \
+}
+
METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp;
- register int r, k;
- int EOB;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, r, idx;
char *BR_buffer;
unsigned int BR;
- int Se = cinfo->Se;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
- int absvalues[DCTSIZE2];
+ JCOEF absvalues_unaligned[DCTSIZE2 + 15];
+ JCOEF *absvalues;
+ const JCOEF *cabsvalue, *EOBPTR;
+ size_t zerobits, signbits;
+ size_t bits[16 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -637,26 +945,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
+#ifdef WITH_SIMD
+ cabsvalue = absvalues = (JCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cabsvalue = absvalues = absvalues_unaligned;
+#endif
- /* It is convenient to make a pre-pass to determine the transformed
- * coefficients' absolute values and the EOB position.
- */
- EOB = 0;
- for (k = cinfo->Ss; k <= Se; k++) {
- temp = (*block)[jpeg_natural_order[k]];
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value.
- */
- if (temp < 0)
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- absvalues[k] = temp; /* save abs value for main pass */
- if (temp == 1)
- EOB = k; /* EOB = index of last newly-nonzero coef */
- }
+ /* Prepare data */
+ EOBPTR = absvalues +
+ entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, absvalues, bits);
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
@@ -664,52 +963,32 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
BR = 0; /* BR = count of buffered bits added now */
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = absvalues[k]) == 0) {
- r++;
- continue;
- }
-
- /* Emit any required ZRLs, but not if they can be folded into EOB */
- while (r > 15 && k <= EOB) {
- /* emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
- /* Emit ZRL */
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- /* Emit buffered correction bits that must be associated with ZRL */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- }
-
- /* If the coef was previously nonzero, it only needs a correction bit.
- * NOTE: a straight translation of the spec's figure G.7 would suggest
- * that we also need to test r > 15. But if r > 15, we can only get here
- * if k > EOB, which implies that this coefficient is not 1.
- */
- if (temp > 1) {
- /* The correction bit is the next bit of the absolute value. */
- BR_buffer[BR++] = (char) (temp & 1);
- continue;
- }
-
- /* Emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
-
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+ signbits = bits[1];
+#else
+ signbits = bits[2];
+#endif
+ ENCODE_COEFS_AC_REFINE((void)0;);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ signbits = bits[3];
+
+ if (zerobits) {
+ int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
+ idx = count_zeroes(&zerobits);
+ signbits >>= idx;
+ idx += diff;
+ r += idx;
+ cabsvalue += idx;
+ goto first_iter_ac_refine;
+ }
- /* Emit output bit for newly-nonzero coef */
- temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
- emit_bits(entropy, (unsigned int) temp, 1);
+ ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
- /* Emit buffered correction bits that must be associated with this code */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- r = 0; /* reset zero run length */
- }
+ r |= (int)((absvalues + Sl) - cabsvalue);
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
@@ -718,7 +997,8 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* 1. overflow of the EOB counter;
* 2. overflow of the correction bit buffer during the next MCU.
*/
- if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+ if (entropy->EOBRUN == 0x7FFF ||
+ entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
emit_eobrun(entropy);
}
@@ -744,9 +1024,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
+finish_pass_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -765,9 +1045,9 @@ finish_pass_phuff (j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -782,7 +1062,7 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
- MEMZERO(did, sizeof(did));
+ memset(did, 0, sizeof(did));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
@@ -793,13 +1073,13 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
} else {
tbl = compptr->ac_tbl_no;
}
- if (! did[tbl]) {
+ if (!did[tbl]) {
if (is_DC_band)
- htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->dc_huff_tbl_ptrs[tbl];
else
- htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->ac_huff_tbl_ptrs[tbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
did[tbl] = TRUE;
}
@@ -812,15 +1092,15 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
+jinit_phuff_encoder(j_compress_ptr cinfo)
{
phuff_entropy_ptr entropy;
int i;
entropy = (phuff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(phuff_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass_phuff;
/* Mark tables unallocated */
diff --git a/media/libjpeg/jcprepct.c b/media/libjpeg/jcprepct.c
index e72ebd87d2..f27cc34507 100644
--- a/media/libjpeg/jcprepct.c
+++ b/media/libjpeg/jcprepct.c
@@ -3,8 +3,8 @@
*
* This file is part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -78,9 +78,9 @@ typedef my_prep_controller *my_prep_ptr;
*/
METHODDEF(void)
-start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
if (pass_mode != JBUF_PASS_THRU)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -106,14 +106,14 @@ start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
LOCAL(void)
-expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
- int input_rows, int output_rows)
+expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+ int output_rows)
{
register int row;
for (row = input_rows; row < output_rows; row++) {
- jcopy_sample_rows(image_data, input_rows-1, image_data, row,
- 1, num_cols);
+ jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
+ num_cols);
}
}
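
expand_bottom_edge() pads a partial iMCU by replicating the last real row, so downsampling and the DCT see well-defined samples below the image. A toy sketch (illustrative only):

    #include <stdio.h>

    int main(void)
    {
      int rows[4][2] = { { 1, 2 }, { 3, 4 }, { 0, 0 }, { 0, 0 } };
      int input_rows = 2, output_rows = 4, row, col;
      for (row = input_rows; row < output_rows; row++)
        for (col = 0; col < 2; col++)
          rows[row][col] = rows[input_rows - 1][col];   /* replicate last row */
      printf("%d %d\n", rows[3][0], rows[3][1]);        /* prints: 3 4 */
      return 0;
    }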
@@ -128,13 +128,12 @@ expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
*/
METHODDEF(void)
-pre_process_data (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail,
- JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
- JDIMENSION out_row_groups_avail)
+pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int numrows, ci;
JDIMENSION inrows;
jpeg_component_info *compptr;
@@ -144,10 +143,10 @@ pre_process_data (j_compress_ptr cinfo,
/* Do color conversion to fill the conversion buffer. */
inrows = in_rows_avail - *in_row_ctr;
numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
- numrows = (int) MIN((JDIMENSION) numrows, inrows);
+ numrows = (int)MIN((JDIMENSION)numrows, inrows);
(*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
prep->color_buf,
- (JDIMENSION) prep->next_buf_row,
+ (JDIMENSION)prep->next_buf_row,
numrows);
*in_row_ctr += numrows;
prep->next_buf_row += numrows;
@@ -164,7 +163,7 @@ pre_process_data (j_compress_ptr cinfo,
/* If we've filled the conversion buffer, empty it. */
if (prep->next_buf_row == cinfo->max_v_samp_factor) {
(*cinfo->downsample->downsample) (cinfo,
- prep->color_buf, (JDIMENSION) 0,
+ prep->color_buf, (JDIMENSION)0,
output_buf, *out_row_group_ctr);
prep->next_buf_row = 0;
(*out_row_group_ctr)++;
@@ -172,14 +171,12 @@ pre_process_data (j_compress_ptr cinfo,
/* If at bottom of image, pad the output to a full iMCU height.
* Note we assume the caller is providing a one-iMCU-height output buffer!
*/
- if (prep->rows_to_go == 0 &&
- *out_row_group_ctr < out_row_groups_avail) {
+ if (prep->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- expand_bottom_edge(output_buf[ci],
- compptr->width_in_blocks * DCTSIZE,
- (int) (*out_row_group_ctr * compptr->v_samp_factor),
- (int) (out_row_groups_avail * compptr->v_samp_factor));
+ expand_bottom_edge(output_buf[ci], compptr->width_in_blocks * DCTSIZE,
+ (int)(*out_row_group_ctr * compptr->v_samp_factor),
+ (int)(out_row_groups_avail * compptr->v_samp_factor));
}
*out_row_group_ctr = out_row_groups_avail;
break; /* can exit outer loop without test */
@@ -195,13 +192,12 @@ pre_process_data (j_compress_ptr cinfo,
*/
METHODDEF(void)
-pre_process_context (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail,
- JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
- JDIMENSION out_row_groups_avail)
+pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int numrows, ci;
int buf_height = cinfo->max_v_samp_factor * 3;
JDIMENSION inrows;
@@ -211,19 +207,18 @@ pre_process_context (j_compress_ptr cinfo,
/* Do color conversion to fill the conversion buffer. */
inrows = in_rows_avail - *in_row_ctr;
numrows = prep->next_buf_stop - prep->next_buf_row;
- numrows = (int) MIN((JDIMENSION) numrows, inrows);
+ numrows = (int)MIN((JDIMENSION)numrows, inrows);
(*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
prep->color_buf,
- (JDIMENSION) prep->next_buf_row,
+ (JDIMENSION)prep->next_buf_row,
numrows);
/* Pad at top of image, if first time through */
if (prep->rows_to_go == cinfo->image_height) {
for (ci = 0; ci < cinfo->num_components; ci++) {
int row;
for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
- jcopy_sample_rows(prep->color_buf[ci], 0,
- prep->color_buf[ci], -row,
- 1, cinfo->image_width);
+ jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+ -row, 1, cinfo->image_width);
}
}
}
@@ -245,9 +240,8 @@ pre_process_context (j_compress_ptr cinfo,
}
/* If we've gotten enough data, downsample a row group. */
if (prep->next_buf_row == prep->next_buf_stop) {
- (*cinfo->downsample->downsample) (cinfo,
- prep->color_buf,
- (JDIMENSION) prep->this_row_group,
+ (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
+ (JDIMENSION)prep->this_row_group,
output_buf, *out_row_group_ctr);
(*out_row_group_ctr)++;
/* Advance pointers with wraparound as necessary. */
@@ -267,9 +261,9 @@ pre_process_context (j_compress_ptr cinfo,
*/
LOCAL(void)
-create_context_buffer (j_compress_ptr cinfo)
+create_context_buffer(j_compress_ptr cinfo)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int rgroup_height = cinfo->max_v_samp_factor;
int ci, i;
jpeg_component_info *compptr;
@@ -279,7 +273,7 @@ create_context_buffer (j_compress_ptr cinfo)
* we need five row groups' worth of pointers for each component.
*/
fake_buffer = (JSAMPARRAY)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(cinfo->num_components * 5 * rgroup_height) *
sizeof(JSAMPROW));
@@ -290,13 +284,13 @@ create_context_buffer (j_compress_ptr cinfo)
* horizontally within the buffer, if it so chooses.
*/
true_buffer = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
- cinfo->max_h_samp_factor) / compptr->h_samp_factor),
- (JDIMENSION) (3 * rgroup_height));
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+ cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+ (JDIMENSION)(3 * rgroup_height));
/* Copy true buffer row pointers into the middle of the fake row array */
- MEMCOPY(fake_buffer + rgroup_height, true_buffer,
- 3 * rgroup_height * sizeof(JSAMPROW));
+ memcpy(fake_buffer + rgroup_height, true_buffer,
+ 3 * rgroup_height * sizeof(JSAMPROW));
/* Fill in the above and below wraparound pointers */
for (i = 0; i < rgroup_height; i++) {
fake_buffer[i] = true_buffer[2 * rgroup_height + i];
@@ -315,7 +309,7 @@ create_context_buffer (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_prep_ptr prep;
int ci;
@@ -325,9 +319,9 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
prep = (my_prep_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_prep_controller));
- cinfo->prep = (struct jpeg_c_prep_controller *) prep;
+ cinfo->prep = (struct jpeg_c_prep_controller *)prep;
prep->pub.start_pass = start_pass_prep;
/* Allocate the color conversion buffer.
@@ -348,10 +342,10 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
- cinfo->max_h_samp_factor) / compptr->h_samp_factor),
- (JDIMENSION) cinfo->max_v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+ cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+ (JDIMENSION)cinfo->max_v_samp_factor);
}
}
}
diff --git a/media/libjpeg/jcsample.c b/media/libjpeg/jcsample.c
index c4b4991487..e8515ebf0f 100644
--- a/media/libjpeg/jcsample.c
+++ b/media/libjpeg/jcsample.c
@@ -6,7 +6,7 @@
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -79,7 +79,7 @@ typedef my_downsampler *my_downsample_ptr;
*/
METHODDEF(void)
-start_pass_downsample (j_compress_ptr cinfo)
+start_pass_downsample(j_compress_ptr cinfo)
{
/* no work for now */
}
@@ -91,19 +91,19 @@ start_pass_downsample (j_compress_ptr cinfo)
*/
LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
- JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
{
register JSAMPROW ptr;
register JSAMPLE pixval;
register int count;
int row;
- int numcols = (int) (output_cols - input_cols);
+ int numcols = (int)(output_cols - input_cols);
if (numcols > 0) {
for (row = 0; row < num_rows; row++) {
ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ pixval = ptr[-1];
for (count = numcols; count > 0; count--)
*ptr++ = pixval;
}
@@ -118,11 +118,11 @@ expand_right_edge (JSAMPARRAY image_data, int num_rows,
*/
METHODDEF(void)
-sep_downsample (j_compress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_index,
- JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
+sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+ JDIMENSION out_row_group_index)
{
- my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
+ my_downsample_ptr downsample = (my_downsample_ptr)cinfo->downsample;
int ci;
jpeg_component_info *compptr;
JSAMPARRAY in_ptr, out_ptr;
@@ -144,8 +144,8 @@ sep_downsample (j_compress_ptr cinfo,
*/
METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */
@@ -156,14 +156,14 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
numpix = h_expand * v_expand;
- numpix2 = numpix/2;
+ numpix2 = numpix / 2;
/* Expand input data enough to let all the output samples be generated
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * h_expand);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * h_expand);
inrow = 0;
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
@@ -172,12 +172,12 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
outcol++, outcol_h += h_expand) {
outvalue = 0;
for (v = 0; v < v_expand; v++) {
- inptr = input_data[inrow+v] + outcol_h;
+ inptr = input_data[inrow + v] + outcol_h;
for (h = 0; h < h_expand; h++) {
- outvalue += (JLONG) GETJSAMPLE(*inptr++);
+ outvalue += (JLONG)(*inptr++);
}
}
- *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
+ *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
}
inrow += v_expand;
}
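
Adding numpix2 = numpix / 2 before the division rounds the averaged pixel to nearest rather than truncating. A sketch (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int numpix = 4, numpix2 = numpix / 2;   /* e.g. 2x2 -> 1 downsampling */
      assert((9 + numpix2) / numpix == 2);    /* average 2.25 rounds to 2 */
      assert((11 + numpix2) / numpix == 3);   /* average 2.75 rounds to 3 */
      return 0;
    }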
@@ -191,15 +191,15 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
/* Copy the data */
- jcopy_sample_rows(input_data, 0, output_data, 0,
- cinfo->max_v_samp_factor, cinfo->image_width);
+ jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
+ cinfo->image_width);
/* Edge-expand */
- expand_right_edge(output_data, cinfo->max_v_samp_factor,
- cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+ expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ compptr->width_in_blocks * DCTSIZE);
}
@@ -216,8 +216,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int outrow;
JDIMENSION outcol;
@@ -229,16 +229,15 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * 2);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * 2);
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
bias = 0; /* bias = 0,1,0,1,... for successive samples */
for (outcol = 0; outcol < output_cols; outcol++) {
- *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
- + bias) >> 1);
+ *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
bias ^= 1; /* 0=>1, 1=>0 */
inptr += 2;
}
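
The alternating bias makes (a + b + bias) >> 1 round down and up on alternate columns, so the half-sample rounding error cancels on average instead of accumulating across the row. A sketch (illustrative only):

    #include <stdio.h>

    int main(void)
    {
      int a = 10, b = 11;   /* true average 10.5 */
      int bias = 0;
      printf("%d\n", (a + b + bias) >> 1);   /* 10: rounds down */
      bias ^= 1;
      printf("%d\n", (a + b + bias) >> 1);   /* 11: rounds up */
      return 0;
    }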
@@ -253,8 +252,8 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow;
JDIMENSION outcol;
@@ -266,21 +265,20 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * 2);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * 2);
inrow = 0;
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
- inptr1 = input_data[inrow+1];
+ inptr1 = input_data[inrow + 1];
bias = 1; /* bias = 1,2,1,2,... for successive samples */
for (outcol = 0; outcol < output_cols; outcol++) {
- *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
- + bias) >> 2);
+ *outptr++ =
+ (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
bias ^= 3; /* 1=>2, 2=>1 */
- inptr0 += 2; inptr1 += 2;
+ inptr0 += 2; inptr1 += 2;
}
inrow += 2;
}
@@ -296,8 +294,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow;
JDIMENSION colctr;
@@ -332,57 +330,45 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
- inptr1 = input_data[inrow+1];
- above_ptr = input_data[inrow-1];
- below_ptr = input_data[inrow+2];
+ inptr1 = input_data[inrow + 1];
+ above_ptr = input_data[inrow - 1];
+ below_ptr = input_data[inrow + 2];
/* Special case for first column: pretend column -1 is same as column 0 */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
neighsum += neighsum;
- neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+ neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
for (colctr = output_cols - 2; colctr > 0; colctr--) {
/* sum of pixels directly mapped to this output element */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
/* sum of edge-neighbor pixels */
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
- GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
/* The edge-neighbors count twice as much as corner-neighbors */
neighsum += neighsum;
/* Add in the corner-neighbors */
- neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
- GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+ neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
/* form final output scaled up by 2^16 */
membersum = membersum * memberscale + neighsum * neighscale;
/* round, descale and output it */
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
}
/* Special case for last column */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
neighsum += neighsum;
- neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+ neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
inrow += 2;
}
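
The smoothing filter works in 16.16 fixed point. Assuming the scale factors set earlier in this file (memberscale = 16384 - smoothing_factor * 80 and neighscale = smoothing_factor * 16, not shown in this hunk), the four member pixels plus the doubled edge neighbors and four corners give weights summing to exactly 65536, so adding 32768 and shifting right by 16 rounds the weighted average to nearest. A sketch under those assumptions (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int SF = 50;   /* cinfo->smoothing_factor, 0..100 */
      long memberscale = 16384 - SF * 80;
      long neighscale = SF * 16;
      /* Flat region of value v: membersum = 4*v; neighsum counts the 8 edge
       * neighbors twice plus the 4 corners, i.e. 20*v in total. */
      int v = 100;
      long sum = 4L * v * memberscale + 20L * v * neighscale;
      assert(((sum + 32768) >> 16) == v);   /* total weight is exactly 65536 */
      return 0;
    }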
@@ -396,8 +382,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int outrow;
JDIMENSION colctr;
@@ -425,36 +411,33 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
- above_ptr = input_data[outrow-1];
- below_ptr = input_data[outrow+1];
+ above_ptr = input_data[outrow - 1];
+ below_ptr = input_data[outrow + 1];
/* Special case for first column */
- colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
- GETJSAMPLE(*inptr);
- membersum = GETJSAMPLE(*inptr++);
- nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
- GETJSAMPLE(*inptr);
+ colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+ membersum = *inptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
neighsum = colsum + (colsum - membersum) + nextcolsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- lastcolsum = colsum; colsum = nextcolsum;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ lastcolsum = colsum; colsum = nextcolsum;
for (colctr = output_cols - 2; colctr > 0; colctr--) {
- membersum = GETJSAMPLE(*inptr++);
- above_ptr++; below_ptr++;
- nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
- GETJSAMPLE(*inptr);
+ membersum = *inptr++;
+ above_ptr++; below_ptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- lastcolsum = colsum; colsum = nextcolsum;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ lastcolsum = colsum; colsum = nextcolsum;
}
/* Special case for last column */
- membersum = GETJSAMPLE(*inptr);
+ membersum = *inptr;
neighsum = lastcolsum + (colsum - membersum) + colsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
}
}
@@ -468,7 +451,7 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_downsampler (j_compress_ptr cinfo)
+jinit_downsampler(j_compress_ptr cinfo)
{
my_downsample_ptr downsample;
int ci;
@@ -476,9 +459,9 @@ jinit_downsampler (j_compress_ptr cinfo)
boolean smoothok = TRUE;
downsample = (my_downsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_downsampler));
- cinfo->downsample = (struct jpeg_downsampler *) downsample;
+ cinfo->downsample = (struct jpeg_downsampler *)downsample;
downsample->pub.start_pass = start_pass_downsample;
downsample->pub.downsample = sep_downsample;
downsample->pub.need_context_rows = FALSE;
diff --git a/media/libjpeg/jctrans.c b/media/libjpeg/jctrans.c
index 6f16b052cf..e121028ec7 100644
--- a/media/libjpeg/jctrans.c
+++ b/media/libjpeg/jctrans.c
@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1998, Thomas G. Lane.
* Modified 2000-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,13 +17,14 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/* Forward declarations */
-LOCAL(void) transencode_master_selection
- (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
-LOCAL(void) transencode_coef_controller
- (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_master_selection(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
/*
@@ -39,14 +40,14 @@ LOCAL(void) transencode_coef_controller
*/
GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
+jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Mark all tables to be written */
jpeg_suppress_tables(cinfo, FALSE);
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Perform master selection of active modules */
transencode_master_selection(cinfo, coef_arrays);
@@ -64,8 +65,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
*/
GLOBAL(void)
-jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
- j_compress_ptr dstinfo)
+jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
{
JQUANT_TBL **qtblptr;
jpeg_component_info *incomp, *outcomp;
@@ -97,12 +97,11 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
/* Copy the source's quantization tables. */
for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
if (srcinfo->quant_tbl_ptrs[tblno] != NULL) {
- qtblptr = & dstinfo->quant_tbl_ptrs[tblno];
+ qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
if (*qtblptr == NULL)
- *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
- MEMCOPY((*qtblptr)->quantval,
- srcinfo->quant_tbl_ptrs[tblno]->quantval,
- sizeof((*qtblptr)->quantval));
+ *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
+ memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
+ sizeof((*qtblptr)->quantval));
(*qtblptr)->sent_table = FALSE;
}
}
@@ -165,8 +164,8 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
*/
LOCAL(void)
-transencode_master_selection (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays)
+transencode_master_selection(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays)
{
/* Although we don't actually use input_components for transcoding,
* jcmaster.c's initial_setup will complain if input_components is 0.
@@ -199,7 +198,7 @@ transencode_master_selection (j_compress_ptr cinfo,
jinit_marker_writer(cinfo);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Write the datastream header (SOI, JFIF) immediately.
* Frame and scan headers are postponed till later.
@@ -238,10 +237,10 @@ typedef my_coef_controller *my_coef_ptr;
LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -250,7 +249,7 @@ start_iMCU_row (j_compress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+ if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -266,9 +265,9 @@ start_iMCU_row (j_compress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
if (pass_mode != JBUF_CRANK_DEST)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -289,9 +288,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -306,9 +305,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
}
/* Loop to process one whole iMCU row */
@@ -321,13 +320,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
- blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (coef->iMCU_row_num < last_iMCU_row ||
- yindex+yoffset < compptr->last_row_height) {
+ yindex + yoffset < compptr->last_row_height) {
/* Fill in pointers to real blocks in this row */
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < blockcnt; xindex++)
MCU_buffer[blkn++] = buffer_ptr++;
} else {
@@ -342,13 +341,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
for (; xindex < compptr->MCU_width; xindex++) {
MCU_buffer[blkn] = coef->dummy_buffer[blkn];
- MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
+ MCU_buffer[blkn][0][0] = MCU_buffer[blkn - 1][0][0];
blkn++;
}
}
}
/* Try to write the MCU. */
- if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -374,17 +373,17 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
LOCAL(void)
-transencode_coef_controller (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays)
+transencode_coef_controller(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays)
{
my_coef_ptr coef;
JBLOCKROW buffer;
int i;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_c_coef_controller *)coef;
coef->pub.start_pass = start_pass_coef;
coef->pub.compress_data = compress_output;
@@ -393,9 +392,9 @@ transencode_coef_controller (j_compress_ptr cinfo,
/* Allocate and pre-zero space for dummy DCT blocks. */
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
- jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+ jzero_far((void *)buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
coef->dummy_buffer[i] = buffer + i;
}
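
The two entry points touched above, jpeg_write_coefficients() and jpeg_copy_critical_parameters(), form the lossless transcoding API (this is what jpegtran is built on). A hedged sketch of a straight DCT-domain copy, with error handling and file setup elided:

#include <stdio.h>
#include "jpeglib.h"

void transcode(FILE *infile, FILE *outfile)
{
  struct jpeg_decompress_struct srcinfo;
  struct jpeg_compress_struct dstinfo;
  struct jpeg_error_mgr jsrcerr, jdsterr;
  jvirt_barray_ptr *coef_arrays;

  srcinfo.err = jpeg_std_error(&jsrcerr);
  jpeg_create_decompress(&srcinfo);
  dstinfo.err = jpeg_std_error(&jdsterr);
  jpeg_create_compress(&dstinfo);

  jpeg_stdio_src(&srcinfo, infile);
  (void)jpeg_read_header(&srcinfo, TRUE);
  coef_arrays = jpeg_read_coefficients(&srcinfo);  /* stay in the DCT domain */

  jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
  jpeg_stdio_dest(&dstinfo, outfile);
  jpeg_write_coefficients(&dstinfo, coef_arrays);  /* re-entropy-code only */

  jpeg_finish_compress(&dstinfo);                  /* before releasing source */
  jpeg_destroy_compress(&dstinfo);
  (void)jpeg_finish_decompress(&srcinfo);
  jpeg_destroy_decompress(&srcinfo);
}
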
diff --git a/media/libjpeg/jdapimin.c b/media/libjpeg/jdapimin.c
index f80a14667f..f50c27edc3 100644
--- a/media/libjpeg/jdapimin.c
+++ b/media/libjpeg/jdapimin.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -23,6 +23,7 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jdmaster.h"
+#include "jconfigint.h"
/*
@@ -31,7 +32,7 @@
*/
GLOBAL(void)
-jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
{
int i;
@@ -41,7 +42,7 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
if (structsize != sizeof(struct jpeg_decompress_struct))
ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
- (int) sizeof(struct jpeg_decompress_struct), (int) structsize);
+ (int)sizeof(struct jpeg_decompress_struct), (int)structsize);
/* For debugging purposes, we zero the whole master structure.
* But the application has already set the err pointer, and may have set
@@ -50,16 +51,16 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
* complain here.
*/
{
- struct jpeg_error_mgr * err = cinfo->err;
- void * client_data = cinfo->client_data; /* ignore Purify complaint here */
- MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
+ struct jpeg_error_mgr *err = cinfo->err;
+ void *client_data = cinfo->client_data; /* ignore Purify complaint here */
+ memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
cinfo->is_decompressor = TRUE;
/* Initialize a memory manager instance for this object */
- jinit_memory_mgr((j_common_ptr) cinfo);
+ jinit_memory_mgr((j_common_ptr)cinfo);
/* Zero out pointers to permanent structures. */
cinfo->progress = NULL;
@@ -89,9 +90,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
* here.
*/
cinfo->master = (struct jpeg_decomp_master *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
- sizeof(my_decomp_master));
- MEMZERO(cinfo->master, sizeof(my_decomp_master));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+ sizeof(my_decomp_master));
+ memset(cinfo->master, 0, sizeof(my_decomp_master));
}
@@ -100,9 +101,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
*/
GLOBAL(void)
-jpeg_destroy_decompress (j_decompress_ptr cinfo)
+jpeg_destroy_decompress(j_decompress_ptr cinfo)
{
- jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+ jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
}
@@ -112,9 +113,9 @@ jpeg_destroy_decompress (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_abort_decompress (j_decompress_ptr cinfo)
+jpeg_abort_decompress(j_decompress_ptr cinfo)
{
- jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+ jpeg_abort((j_common_ptr)cinfo); /* use common routine */
}
@@ -123,7 +124,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
*/
LOCAL(void)
-default_decompress_parms (j_decompress_ptr cinfo)
+default_decompress_parms(j_decompress_ptr cinfo)
{
/* Guess the input colorspace, and set output colorspace accordingly. */
/* (Wish JPEG committee had provided a real way to specify this...) */
@@ -250,7 +251,7 @@ default_decompress_parms (j_decompress_ptr cinfo)
*/
GLOBAL(int)
-jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
+jpeg_read_header(j_decompress_ptr cinfo, boolean require_image)
{
int retcode;
@@ -271,7 +272,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
* call jpeg_abort, but we can't change it now for compatibility reasons.
* A side effect is to free any temporary memory (there shouldn't be any).
*/
- jpeg_abort((j_common_ptr) cinfo); /* sets state = DSTATE_START */
+ jpeg_abort((j_common_ptr)cinfo); /* sets state = DSTATE_START */
retcode = JPEG_HEADER_TABLES_ONLY;
break;
case JPEG_SUSPENDED:
@@ -296,7 +297,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
*/
GLOBAL(int)
-jpeg_consume_input (j_decompress_ptr cinfo)
+jpeg_consume_input(j_decompress_ptr cinfo)
{
int retcode = JPEG_SUSPENDED;
@@ -308,7 +309,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
/* Initialize application's data source module */
(*cinfo->src->init_source) (cinfo);
cinfo->global_state = DSTATE_INHEADER;
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case DSTATE_INHEADER:
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */
@@ -343,7 +344,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_input_complete (j_decompress_ptr cinfo)
+jpeg_input_complete(j_decompress_ptr cinfo)
{
/* Check for valid jpeg object */
if (cinfo->global_state < DSTATE_START ||
@@ -358,7 +359,7 @@ jpeg_input_complete (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_has_multiple_scans (j_decompress_ptr cinfo)
+jpeg_has_multiple_scans(j_decompress_ptr cinfo)
{
/* Only valid after jpeg_read_header completes */
if (cinfo->global_state < DSTATE_READY ||
@@ -378,10 +379,10 @@ jpeg_has_multiple_scans (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_finish_decompress (j_decompress_ptr cinfo)
+jpeg_finish_decompress(j_decompress_ptr cinfo)
{
if ((cinfo->global_state == DSTATE_SCANNING ||
- cinfo->global_state == DSTATE_RAW_OK) && ! cinfo->buffered_image) {
+ cinfo->global_state == DSTATE_RAW_OK) && !cinfo->buffered_image) {
/* Terminate final pass of non-buffered mode */
if (cinfo->output_scanline < cinfo->output_height)
ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
@@ -395,13 +396,13 @@ jpeg_finish_decompress (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
}
/* Read until EOI */
- while (! cinfo->inputctl->eoi_reached) {
+ while (!cinfo->inputctl->eoi_reached) {
if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return FALSE; /* Suspend, come back later */
}
/* Do final cleanup */
(*cinfo->src->term_source) (cinfo);
/* We can use jpeg_abort to release memory and reset global_state */
- jpeg_abort((j_common_ptr) cinfo);
+ jpeg_abort((j_common_ptr)cinfo);
return TRUE;
}
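
jdapimin.c holds the minimum decompression API patched above. As a usage sketch (assuming a build with the memory source manager, as this import has), probing a JPEG's dimensions without decoding any pixels looks roughly like:

#include <stdio.h>
#include "jpeglib.h"

int probe_jpeg(const unsigned char *buf, unsigned long len)
{
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;

  cinfo.err = jpeg_std_error(&jerr);  /* default mgr calls exit() on error */
  jpeg_create_decompress(&cinfo);     /* wraps jpeg_CreateDecompress() */
  jpeg_mem_src(&cinfo, buf, len);
  if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
    jpeg_destroy_decompress(&cinfo);
    return -1;
  }
  printf("%u x %u, %d components\n", cinfo.image_width, cinfo.image_height,
         cinfo.num_components);
  jpeg_destroy_decompress(&cinfo);    /* wraps jpeg_destroy() */
  return 0;
}
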
diff --git a/media/libjpeg/jdapistd.c b/media/libjpeg/jdapistd.c
index 37afc8448b..8827d8abf5 100644
--- a/media/libjpeg/jdapistd.c
+++ b/media/libjpeg/jdapistd.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -21,11 +21,13 @@
#include "jinclude.h"
#include "jdmainct.h"
#include "jdcoefct.h"
+#include "jdmaster.h"
+#include "jdmerge.h"
#include "jdsample.h"
#include "jmemsys.h"
/* Forward declarations */
-LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
+LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
/*
@@ -40,7 +42,7 @@ LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
*/
GLOBAL(boolean)
-jpeg_start_decompress (j_decompress_ptr cinfo)
+jpeg_start_decompress(j_decompress_ptr cinfo)
{
if (cinfo->global_state == DSTATE_READY) {
/* First call: initialize master control, select active modules */
@@ -60,7 +62,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
int retcode;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL)
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
/* Absorb some more input */
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_SUSPENDED)
@@ -72,7 +74,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
(retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
/* jdmaster underestimated number of scans; ratchet up one scan */
- cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
}
}
}
@@ -97,7 +99,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
*/
LOCAL(boolean)
-output_pass_setup (j_decompress_ptr cinfo)
+output_pass_setup(j_decompress_ptr cinfo)
{
if (cinfo->global_state != DSTATE_PRESCAN) {
/* First call: do pass setup */
@@ -113,14 +115,14 @@ output_pass_setup (j_decompress_ptr cinfo)
JDIMENSION last_scanline;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Process some data */
last_scanline = cinfo->output_scanline;
- (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL,
- &cinfo->output_scanline, (JDIMENSION) 0);
+ (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+ &cinfo->output_scanline, (JDIMENSION)0);
if (cinfo->output_scanline == last_scanline)
return FALSE; /* No progress made, must suspend */
}
@@ -150,13 +152,14 @@ output_pass_setup (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
- JDIMENSION *width)
+jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+ JDIMENSION *width)
{
int ci, align, orig_downsampled_width;
JDIMENSION input_xoffset;
boolean reinit_upsampler = FALSE;
jpeg_component_info *compptr;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -190,7 +193,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
* single-pass decompression case, allowing us to use the same MCU column
* width for all of the components.
*/
- align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+ if (cinfo->comps_in_scan == 1 && cinfo->num_components == 1)
+ align = cinfo->_min_DCT_scaled_size;
+ else
+ align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
/* Adjust xoffset to the nearest iMCU boundary <= the requested value */
input_xoffset = *xoffset;
@@ -203,24 +209,31 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
*width = *width + input_xoffset - *xoffset;
cinfo->output_width = *width;
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ upsample->out_row_width =
+ cinfo->output_width * cinfo->out_color_components;
+ }
/* Set the first and last iMCU columns that we must decompress. These values
* will be used in single-scan decompressions.
*/
- cinfo->master->first_iMCU_col =
- (JDIMENSION) (long) (*xoffset) / (long) align;
+ cinfo->master->first_iMCU_col = (JDIMENSION)(long)(*xoffset) / (long)align;
cinfo->master->last_iMCU_col =
- (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
- (long) align) - 1;
+ (JDIMENSION)jdiv_round_up((long)(*xoffset + cinfo->output_width),
+ (long)align) - 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
+ int hsf = (cinfo->comps_in_scan == 1 && cinfo->num_components == 1) ?
+ 1 : compptr->h_samp_factor;
+
/* Set downsampled_width to the new output width. */
orig_downsampled_width = compptr->downsampled_width;
compptr->downsampled_width =
- (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
- compptr->h_samp_factor),
- (long) cinfo->max_h_samp_factor);
+ (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
+ compptr->h_samp_factor),
+ (long)cinfo->max_h_samp_factor);
if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
reinit_upsampler = TRUE;
@@ -228,12 +241,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
* values will be used in multi-scan decompressions.
*/
cinfo->master->first_MCU_col[ci] =
- (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) /
- (long) align;
+ (JDIMENSION)(long)(*xoffset * hsf) / (long)align;
cinfo->master->last_MCU_col[ci] =
- (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
- compptr->h_samp_factor),
- (long) align) - 1;
+ (JDIMENSION)jdiv_round_up((long)((*xoffset + cinfo->output_width) * hsf),
+ (long)align) - 1;
}
if (reinit_upsampler) {
@@ -258,8 +269,8 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
GLOBAL(JDIMENSION)
-jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
- JDIMENSION max_lines)
+jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+ JDIMENSION max_lines)
{
JDIMENSION row_ctr;
@@ -272,9 +283,9 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Process some data */
@@ -287,8 +298,16 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
/* Dummy color convert function used by jpeg_skip_scanlines() */
LOCAL(void)
-noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+
+/* Dummy quantize function used by jpeg_skip_scanlines() */
+LOCAL(void)
+noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
{
}
@@ -302,20 +321,46 @@ noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
*/
LOCAL(void)
-read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
{
JDIMENSION n;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+ JSAMPLE dummy_sample[1] = { 0 };
+ JSAMPROW dummy_row = dummy_sample;
+ JSAMPARRAY scanlines = NULL;
void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows);
+ int num_rows) = NULL;
+ void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows) = NULL;
+
+ if (cinfo->cconvert && cinfo->cconvert->color_convert) {
+ color_convert = cinfo->cconvert->color_convert;
+ cinfo->cconvert->color_convert = noop_convert;
+ /* This just prevents UBSan from complaining about adding 0 to a NULL
+ * pointer. The pointer isn't actually used.
+ */
+ scanlines = &dummy_row;
+ }
+
+ if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
+ color_quantize = cinfo->cquantize->color_quantize;
+ cinfo->cquantize->color_quantize = noop_quantize;
+ }
- color_convert = cinfo->cconvert->color_convert;
- cinfo->cconvert->color_convert = noop_convert;
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ scanlines = &upsample->spare_row;
+ }
for (n = 0; n < num_lines; n++)
- jpeg_read_scanlines(cinfo, NULL, 1);
+ jpeg_read_scanlines(cinfo, scanlines, 1);
- cinfo->cconvert->color_convert = color_convert;
+ if (color_convert)
+ cinfo->cconvert->color_convert = color_convert;
+
+ if (color_quantize)
+ cinfo->cquantize->color_quantize = color_quantize;
}
@@ -325,10 +370,16 @@ read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
*/
LOCAL(void)
-increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
{
JDIMENSION rows_left;
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ read_and_discard_scanlines(cinfo, rows);
+ return;
+ }
/* Increment the counter to the next row group after the skipped rows. */
main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
@@ -354,23 +405,31 @@ increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
*/
GLOBAL(JDIMENSION)
-jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
JDIMENSION i, x;
int y;
JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
JDIMENSION lines_to_skip, lines_to_read;
+ /* Two-pass color quantization is not supported. */
+ if (cinfo->quantize_colors && cinfo->two_pass_quantize)
+ ERREXIT(cinfo, JERR_NOTIMPL);
+
if (cinfo->global_state != DSTATE_SCANNING)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Do not skip past the bottom of the image. */
if (cinfo->output_scanline + num_lines >= cinfo->output_height) {
+ num_lines = cinfo->output_height - cinfo->output_scanline;
cinfo->output_scanline = cinfo->output_height;
- return cinfo->output_height - cinfo->output_scanline;
+ (*cinfo->inputctl->finish_input_pass) (cinfo);
+ cinfo->inputctl->eoi_reached = TRUE;
+ return num_lines;
}
if (num_lines == 0)
@@ -419,8 +478,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
main_ptr->buffer_full = FALSE;
main_ptr->rowgroup_ctr = 0;
main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
- upsample->next_row_out = cinfo->max_v_samp_factor;
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample) {
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
}
/* Skipping is much simpler when context rows are not required. */
@@ -432,8 +493,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
cinfo->output_scanline += lines_left_in_iMCU_row;
main_ptr->buffer_full = FALSE;
main_ptr->rowgroup_ctr = 0;
- upsample->next_row_out = cinfo->max_v_samp_factor;
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample) {
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
}
}
@@ -458,7 +521,7 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
if (cinfo->upsample->need_context_rows) {
cinfo->output_scanline += lines_to_skip;
cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
- main_ptr->iMCU_row_ctr += lines_after_iMCU_row / lines_per_iMCU_row;
+ main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
/* It is complex to properly move to the middle of a context block, so
* read the remaining lines instead of skipping them.
*/
@@ -468,7 +531,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
increment_simple_rowgroup_ctr(cinfo, lines_to_read);
}
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample)
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
return num_lines;
}
@@ -480,6 +544,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
* decoded coefficients. This is ~5% faster for large subsets, but
* it's tough to tell a difference for smaller images.
*/
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
(*cinfo->entropy->decode_mcu) (cinfo, NULL);
}
}
@@ -509,7 +575,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
* bit odd, since "rows_to_go" seems to be redundantly keeping track of
* output_scanline.
*/
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample)
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
/* Always skip the requested number of lines. */
return num_lines;
@@ -521,8 +588,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
*/
GLOBAL(JDIMENSION)
-jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION max_lines)
+jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION max_lines)
{
JDIMENSION lines_per_iMCU_row;
@@ -535,9 +602,9 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Verify that at least one iMCU row can be returned. */
@@ -546,7 +613,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
/* Decompress directly into user's buffer. */
- if (! (*cinfo->coef->decompress_data) (cinfo, data))
+ if (!(*cinfo->coef->decompress_data) (cinfo, data))
return 0; /* suspension forced, can do nothing more */
/* OK, we processed one iMCU row. */
@@ -564,7 +631,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
*/
GLOBAL(boolean)
-jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
+jpeg_start_output(j_decompress_ptr cinfo, int scan_number)
{
if (cinfo->global_state != DSTATE_BUFIMAGE &&
cinfo->global_state != DSTATE_PRESCAN)
@@ -572,8 +639,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
/* Limit scan number to valid range */
if (scan_number <= 0)
scan_number = 1;
- if (cinfo->inputctl->eoi_reached &&
- scan_number > cinfo->input_scan_number)
+ if (cinfo->inputctl->eoi_reached && scan_number > cinfo->input_scan_number)
scan_number = cinfo->input_scan_number;
cinfo->output_scan_number = scan_number;
/* Perform any dummy output passes, and set up for the real pass */
@@ -589,7 +655,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
*/
GLOBAL(boolean)
-jpeg_finish_output (j_decompress_ptr cinfo)
+jpeg_finish_output(j_decompress_ptr cinfo)
{
if ((cinfo->global_state == DSTATE_SCANNING ||
cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) {
@@ -603,7 +669,7 @@ jpeg_finish_output (j_decompress_ptr cinfo)
}
/* Read markers looking for SOS or EOI */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
- ! cinfo->inputctl->eoi_reached) {
+ !cinfo->inputctl->eoi_reached) {
if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return FALSE; /* Suspend, come back later */
}
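
Most of the jdapistd.c churn above is in the partial-decompression API, jpeg_crop_scanline() and jpeg_skip_scanlines(). A hedged sketch of decoding just one band of the image; the helper name is made up, and buffer sizing is the caller's responsibility:

#include <stdio.h>
#include "jpeglib.h"

/* band_rows must hold at least (y1 - y0) rows of the post-crop width;
 * x and w are widened outward to iMCU boundaries, so re-check them. */
void read_band(j_decompress_ptr cinfo, JDIMENSION x, JDIMENSION w,
               JDIMENSION y0, JDIMENSION y1, JSAMPARRAY band_rows)
{
  JDIMENSION n = 0;

  jpeg_crop_scanline(cinfo, &x, &w);        /* after jpeg_start_decompress() */
  if (y0 > 0)
    (void)jpeg_skip_scanlines(cinfo, y0);   /* discard rows above the band */
  while (cinfo->output_scanline < y1)
    n += jpeg_read_scanlines(cinfo, band_rows + n,
                             y1 - cinfo->output_scanline);
  if (cinfo->output_scanline < cinfo->output_height)  /* discard the rest */
    (void)jpeg_skip_scanlines(cinfo,
                              cinfo->output_height - cinfo->output_scanline);
}
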
diff --git a/media/libjpeg/jdarith.c b/media/libjpeg/jdarith.c
index df3540eef5..21575e80c7 100644
--- a/media/libjpeg/jdarith.c
+++ b/media/libjpeg/jdarith.c
@@ -4,16 +4,19 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2015 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains portable arithmetic entropy decoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
*
* Both sequential and progressive modes are supported in this single module.
*
* Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -21,6 +24,9 @@
#include "jpeglib.h"
+#define NEG_1 ((unsigned int)-1)
+
+
/* Expanded entropy decoder object for arithmetic decoding. */
typedef struct {
@@ -60,21 +66,21 @@ typedef arith_entropy_decoder *arith_entropy_ptr;
* in the lower bits (mask 0x7F).
*/
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
LOCAL(int)
-get_byte (j_decompress_ptr cinfo)
+get_byte(j_decompress_ptr cinfo)
/* Read next input byte; we do not support suspension in this module. */
{
struct jpeg_source_mgr *src = cinfo->src;
if (src->bytes_in_buffer == 0)
- if (! (*src->fill_input_buffer) (cinfo))
+ if (!(*src->fill_input_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
src->bytes_in_buffer--;
- return GETJOCTET(*src->next_input_byte++);
+ return *src->next_input_byte++;
}
@@ -106,9 +112,9 @@ get_byte (j_decompress_ptr cinfo)
*/
LOCAL(int)
-arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+arith_decode(j_decompress_ptr cinfo, unsigned char *st)
{
- register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
register unsigned char nl, nm;
register JLONG qe, temp;
register int sv, data;
@@ -153,8 +159,8 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
*/
sv = *st;
qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
- nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
- nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
/* Decode & estimation procedures per sections D.2.4 & D.2.5 */
temp = e->a - qe;
@@ -190,27 +196,27 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
*/
LOCAL(void)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci;
jpeg_component_info *compptr;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
/* Re-initialize statistics areas */
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
- MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+ memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
if (!cinfo->progressive_mode || cinfo->Ss) {
- MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+ memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@@ -241,9 +247,9 @@ process_restart (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int blkn, ci, tbl, sign;
@@ -277,7 +283,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Figure F.21: Decoding nonzero value v */
/* Figure F.22: Decoding the sign of v */
sign = arith_decode(cinfo, st + 1);
- st += 2; st += sign;
+ st += 2; st += sign;
/* Figure F.23: Decoding the magnitude category of v */
if ((m = arith_decode(cinfo, st)) != 0) {
st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
@@ -291,9 +297,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
else
entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
@@ -302,12 +308,12 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
- entropy->last_dc_val[ci] += v;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
}
/* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
- (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
+ (*block)[0] = (JCOEF)LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
}
return TRUE;
@@ -320,9 +326,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, sign, k;
@@ -348,7 +354,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
if (arith_decode(cinfo, st)) break; /* EOB flag */
while (arith_decode(cinfo, st + 1) == 0) {
- st += 3; k++;
+ st += 3; k++;
if (k > cinfo->Se) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -380,9 +386,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
+ v += 1; if (sign) v = -v;
/* Scale and output coefficient in natural (dezigzagged) order */
- (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
+ (*block)[jpeg_natural_order[k]] = (JCOEF)((unsigned)v << cinfo->Al);
}
return TRUE;
@@ -394,9 +400,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
unsigned char *st;
int p1, blkn;
@@ -427,9 +433,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
JCOEFPTR thiscoef;
unsigned char *st;
@@ -450,7 +456,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
- m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */
+ m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
/* Establish EOBx (previous stage end-of-block) index */
for (kex = cinfo->Se; kex > 0; kex--)
@@ -465,20 +471,20 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (*thiscoef) { /* previously nonzero coef */
if (arith_decode(cinfo, st + 2)) {
if (*thiscoef < 0)
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
else
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
}
break;
}
if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */
if (arith_decode(cinfo, entropy->fixed_bin))
- *thiscoef = m1;
+ *thiscoef = (JCOEF)m1;
else
- *thiscoef = p1;
+ *thiscoef = (JCOEF)p1;
break;
}
- st += 3; k++;
+ st += 3; k++;
if (k > cinfo->Se) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -496,9 +502,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
jpeg_component_info *compptr;
JBLOCKROW block;
unsigned char *st;
@@ -535,7 +541,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Figure F.21: Decoding nonzero value v */
/* Figure F.22: Decoding the sign of v */
sign = arith_decode(cinfo, st + 1);
- st += 2; st += sign;
+ st += 2; st += sign;
/* Figure F.23: Decoding the magnitude category of v */
if ((m = arith_decode(cinfo, st)) != 0) {
st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
@@ -549,9 +555,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
else
entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
@@ -560,12 +566,12 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
- entropy->last_dc_val[ci] += v;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
}
if (block)
- (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+ (*block)[0] = (JCOEF)entropy->last_dc_val[ci];
/* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
@@ -576,7 +582,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
if (arith_decode(cinfo, st)) break; /* EOB flag */
while (arith_decode(cinfo, st + 1) == 0) {
- st += 3; k++;
+ st += 3; k++;
if (k > DCTSIZE2 - 1) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -608,9 +614,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
+ v += 1; if (sign) v = -v;
if (block)
- (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)v;
}
}
@@ -623,9 +629,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci, tbl;
jpeg_component_info *compptr;
@@ -644,11 +650,11 @@ start_pass (j_decompress_ptr cinfo)
}
if (cinfo->Ah != 0) {
/* Successive approximation refinement scan: must have Al = Ah-1. */
- if (cinfo->Ah-1 != cinfo->Al)
+ if (cinfo->Ah - 1 != cinfo->Al)
goto bad;
}
if (cinfo->Al > 13) { /* need not check for < 0 */
- bad:
+bad:
ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
}
@@ -658,9 +664,17 @@ start_pass (j_decompress_ptr cinfo)
*/
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
- int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+ int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ int *prev_coef_bit_ptr =
+ &cinfo->coef_bits[cindex + cinfo->num_components][0];
if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
if (cinfo->Ah != expected)
@@ -684,8 +698,8 @@ start_pass (j_decompress_ptr cinfo)
/* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
* This ought to be an error condition, but we make it a warning.
*/
- if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
- (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
+ cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
/* Select MCU decoding routine */
entropy->pub.decode_mcu = decode_mcu;
@@ -699,9 +713,9 @@ start_pass (j_decompress_ptr cinfo)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->dc_stats[tbl] == NULL)
- entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
- MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+ entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+ memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@@ -711,9 +725,9 @@ start_pass (j_decompress_ptr cinfo)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->ac_stats[tbl] == NULL)
- entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
- MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+ entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+ memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
}
}
@@ -721,6 +735,7 @@ start_pass (j_decompress_ptr cinfo)
entropy->c = 0;
entropy->a = 0;
entropy->ct = -16; /* force reading 2 initial bytes to fill C */
+ entropy->pub.insufficient_data = FALSE;
/* Initialize restart counter */
entropy->restarts_to_go = cinfo->restart_interval;
@@ -732,15 +747,15 @@ start_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_arith_decoder (j_decompress_ptr cinfo)
+jinit_arith_decoder(j_decompress_ptr cinfo)
{
arith_entropy_ptr entropy;
int i;
entropy = (arith_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(arith_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass;
/* Mark tables unallocated */
@@ -756,9 +771,10 @@ jinit_arith_decoder (j_decompress_ptr cinfo)
/* Create progression status table */
int *coef_bit_ptr, ci;
cinfo->coef_bits = (int (*)[DCTSIZE2])
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components*DCTSIZE2*sizeof(int));
- coef_bit_ptr = & cinfo->coef_bits[0][0];
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 * DCTSIZE2 *
+ sizeof(int));
+ coef_bit_ptr = &cinfo->coef_bits[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (i = 0; i < DCTSIZE2; i++)
*coef_bit_ptr++ = -1;
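
The NEG_1 macro introduced above exists because left-shifting a negative signed integer is undefined behavior in C (C99 6.5.7), which UBSan flags; shifting the all-ones unsigned pattern is well defined and yields the same bits. A tiny illustration (the narrowing back to a signed type is implementation-defined before C23, but two's complement everywhere this library runs):

#include <stdio.h>

int main(void)
{
  int Al = 3;
  /* "(-1) << Al" is UB; the unsigned shift below is fully defined. */
  unsigned int m1 = ((unsigned int)-1) << Al;  /* 0x...FFF8 */
  short coef = (short)m1;   /* -8 as a JCOEF-sized value */
  printf("%d\n", coef);     /* prints -8: "-1 in the bit position being coded" */
  return 0;
}
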
diff --git a/media/libjpeg/jdatadst.c b/media/libjpeg/jdatadst.c
index dcaf6f0f96..6b4fed2339 100644
--- a/media/libjpeg/jdatadst.c
+++ b/media/libjpeg/jdatadst.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2012 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -23,11 +23,6 @@
#include "jpeglib.h"
#include "jerror.h"
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
/* Expanded data destination object for stdio output */
@@ -66,14 +61,14 @@ typedef my_mem_destination_mgr *my_mem_dest_ptr;
*/
METHODDEF(void)
-init_destination (j_compress_ptr cinfo)
+init_destination(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
/* Allocate the output buffer --- it will be released when done with image */
dest->buffer = (JOCTET *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- OUTPUT_BUF_SIZE * sizeof(JOCTET));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ OUTPUT_BUF_SIZE * sizeof(JOCTET));
dest->pub.next_output_byte = dest->buffer;
dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
@@ -81,7 +76,7 @@ init_destination (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-init_mem_destination (j_compress_ptr cinfo)
+init_mem_destination(j_compress_ptr cinfo)
{
/* no work necessary here */
}
@@ -112,12 +107,12 @@ init_mem_destination (j_compress_ptr cinfo)
*/
METHODDEF(boolean)
-empty_output_buffer (j_compress_ptr cinfo)
+empty_output_buffer(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
- if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
- (size_t) OUTPUT_BUF_SIZE)
+ if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
+ (size_t)OUTPUT_BUF_SIZE)
ERREXIT(cinfo, JERR_FILE_WRITE);
dest->pub.next_output_byte = dest->buffer;
@@ -128,23 +123,22 @@ empty_output_buffer (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(boolean)
-empty_mem_output_buffer (j_compress_ptr cinfo)
+empty_mem_output_buffer(j_compress_ptr cinfo)
{
size_t nextsize;
JOCTET *nextbuffer;
- my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+ my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
/* Try to allocate new buffer with double size */
nextsize = dest->bufsize * 2;
- nextbuffer = (JOCTET *) malloc(nextsize);
+ nextbuffer = (JOCTET *)malloc(nextsize);
if (nextbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
- MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+ memcpy(nextbuffer, dest->buffer, dest->bufsize);
- if (dest->newbuffer != NULL)
- free(dest->newbuffer);
+ free(dest->newbuffer);
dest->newbuffer = nextbuffer;
@@ -169,14 +163,14 @@ empty_mem_output_buffer (j_compress_ptr cinfo)
*/
METHODDEF(void)
-term_destination (j_compress_ptr cinfo)
+term_destination(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
/* Write any data remaining in the buffer */
if (datacount > 0) {
- if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
+ if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
ERREXIT(cinfo, JERR_FILE_WRITE);
}
fflush(dest->outfile);
@@ -187,9 +181,9 @@ term_destination (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-term_mem_destination (j_compress_ptr cinfo)
+term_mem_destination(j_compress_ptr cinfo)
{
- my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+ my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
*dest->outbuffer = dest->buffer;
*dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
@@ -204,7 +198,7 @@ term_mem_destination (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
+jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
{
my_dest_ptr dest;
@@ -213,7 +207,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
*/
if (cinfo->dest == NULL) { /* first time for this JPEG object? */
cinfo->dest = (struct jpeg_destination_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_destination_mgr));
} else if (cinfo->dest->init_destination != init_destination) {
/* It is unsafe to reuse the existing destination manager unless it was
@@ -225,7 +219,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- dest = (my_dest_ptr) cinfo->dest;
+ dest = (my_dest_ptr)cinfo->dest;
dest->pub.init_destination = init_destination;
dest->pub.empty_output_buffer = empty_output_buffer;
dest->pub.term_destination = term_destination;
@@ -249,8 +243,8 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
*/
GLOBAL(void)
-jpeg_mem_dest (j_compress_ptr cinfo,
- unsigned char **outbuffer, unsigned long *outsize)
+jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+ unsigned long *outsize)
{
my_mem_dest_ptr dest;
@@ -262,7 +256,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
*/
if (cinfo->dest == NULL) { /* first time for this JPEG object? */
cinfo->dest = (struct jpeg_destination_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_mem_destination_mgr));
} else if (cinfo->dest->init_destination != init_mem_destination) {
/* It is unsafe to reuse the existing destination manager unless it was
@@ -271,7 +265,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- dest = (my_mem_dest_ptr) cinfo->dest;
+ dest = (my_mem_dest_ptr)cinfo->dest;
dest->pub.init_destination = init_mem_destination;
dest->pub.empty_output_buffer = empty_mem_output_buffer;
dest->pub.term_destination = term_mem_destination;
@@ -281,7 +275,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
if (*outbuffer == NULL || *outsize == 0) {
/* Allocate initial buffer */
- dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+ dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
if (dest->newbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
*outsize = OUTPUT_BUF_SIZE;
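
For reference, the memory destination manager patched above is normally driven through jpeg_mem_dest(). A hedged sketch; it assumes cinfo already carries image parameters and jpeg_set_defaults(), and the scanline loop is elided:

#include <stdio.h>
#include <stdlib.h>
#include "jpeglib.h"

unsigned long compress_to_mem(j_compress_ptr cinfo, unsigned char **outbuf)
{
  unsigned long outsize = 0;

  *outbuf = NULL;                    /* NULL => the library malloc()s it */
  jpeg_mem_dest(cinfo, outbuf, &outsize);
  jpeg_start_compress(cinfo, TRUE);
  /* ... jpeg_write_scanlines() loop elided ... */
  jpeg_finish_compress(cinfo);
  return outsize;                    /* bytes written; caller free()s *outbuf */
}
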
diff --git a/media/libjpeg/jdatasrc.c b/media/libjpeg/jdatasrc.c
index c83183fe19..e36a30d894 100644
--- a/media/libjpeg/jdatasrc.c
+++ b/media/libjpeg/jdatasrc.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2011 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -45,9 +45,9 @@ typedef my_source_mgr *my_src_ptr;
*/
METHODDEF(void)
-init_source (j_decompress_ptr cinfo)
+init_source(j_decompress_ptr cinfo)
{
- my_src_ptr src = (my_src_ptr) cinfo->src;
+ my_src_ptr src = (my_src_ptr)cinfo->src;
/* We reset the empty-input-file flag for each image,
* but we don't clear the input buffer.
@@ -58,7 +58,7 @@ init_source (j_decompress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-init_mem_source (j_decompress_ptr cinfo)
+init_mem_source(j_decompress_ptr cinfo)
{
/* no work necessary here */
}
@@ -99,20 +99,20 @@ init_mem_source (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-fill_input_buffer (j_decompress_ptr cinfo)
+fill_input_buffer(j_decompress_ptr cinfo)
{
- my_src_ptr src = (my_src_ptr) cinfo->src;
+ my_src_ptr src = (my_src_ptr)cinfo->src;
size_t nbytes;
- nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
+ nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
if (nbytes <= 0) {
if (src->start_of_file) /* Treat empty input file as fatal error */
ERREXIT(cinfo, JERR_INPUT_EMPTY);
WARNMS(cinfo, JWRN_JPEG_EOF);
/* Insert a fake EOI marker */
- src->buffer[0] = (JOCTET) 0xFF;
- src->buffer[1] = (JOCTET) JPEG_EOI;
+ src->buffer[0] = (JOCTET)0xFF;
+ src->buffer[1] = (JOCTET)JPEG_EOI;
nbytes = 2;
}
@@ -125,10 +125,10 @@ fill_input_buffer (j_decompress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(boolean)
-fill_mem_input_buffer (j_decompress_ptr cinfo)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
{
static const JOCTET mybuffer[4] = {
- (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+ (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
};
/* The whole JPEG data is expected to reside in the supplied memory
@@ -160,7 +160,7 @@ fill_mem_input_buffer (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
{
struct jpeg_source_mgr *src = cinfo->src;
@@ -169,15 +169,15 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
* any trouble anyway --- large skips are infrequent.
*/
if (num_bytes > 0) {
- while (num_bytes > (long) src->bytes_in_buffer) {
- num_bytes -= (long) src->bytes_in_buffer;
- (void) (*src->fill_input_buffer) (cinfo);
+ while (num_bytes > (long)src->bytes_in_buffer) {
+ num_bytes -= (long)src->bytes_in_buffer;
+ (void)(*src->fill_input_buffer) (cinfo);
/* note we assume that fill_input_buffer will never return FALSE,
* so suspension need not be handled.
*/
}
- src->next_input_byte += (size_t) num_bytes;
- src->bytes_in_buffer -= (size_t) num_bytes;
+ src->next_input_byte += (size_t)num_bytes;
+ src->bytes_in_buffer -= (size_t)num_bytes;
}
}
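
The skip loop above never seeks the underlying stream; it just drains whole buffer loads until the remainder fits in one, then advances the pointers. A worked trace, assuming the buffer is full on entry:

    /* worked example: num_bytes = 10000, INPUT_BUF_SIZE = 4096,
     * bytes_in_buffer == 4096 on entry:
     *   pass 1: 10000 > 4096  ->  refill, num_bytes = 5904
     *   pass 2:  5904 > 4096  ->  refill, num_bytes = 1808
     *   exit loop: next_input_byte += 1808; bytes_in_buffer -= 1808
     */
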
@@ -201,7 +201,7 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
*/
METHODDEF(void)
-term_source (j_decompress_ptr cinfo)
+term_source(j_decompress_ptr cinfo)
{
/* no work necessary here */
}
@@ -214,7 +214,7 @@ term_source (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
+jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
{
my_src_ptr src;
@@ -225,11 +225,11 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
*/
if (cinfo->src == NULL) { /* first time for this JPEG object? */
cinfo->src = (struct jpeg_source_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_source_mgr));
- src = (my_src_ptr) cinfo->src;
+ src = (my_src_ptr)cinfo->src;
src->buffer = (JOCTET *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
INPUT_BUF_SIZE * sizeof(JOCTET));
} else if (cinfo->src->init_source != init_source) {
/* It is unsafe to reuse the existing source manager unless it was created
@@ -241,7 +241,7 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- src = (my_src_ptr) cinfo->src;
+ src = (my_src_ptr)cinfo->src;
src->pub.init_source = init_source;
src->pub.fill_input_buffer = fill_input_buffer;
src->pub.skip_input_data = skip_input_data;
@@ -260,8 +260,8 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
*/
GLOBAL(void)
-jpeg_mem_src (j_decompress_ptr cinfo,
- const unsigned char *inbuffer, unsigned long insize)
+jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+ unsigned long insize)
{
struct jpeg_source_mgr *src;
@@ -274,7 +274,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
*/
if (cinfo->src == NULL) { /* first time for this JPEG object? */
cinfo->src = (struct jpeg_source_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(struct jpeg_source_mgr));
} else if (cinfo->src->init_source != init_mem_source) {
/* It is unsafe to reuse the existing source manager unless it was created
@@ -289,7 +289,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
src->skip_input_data = skip_input_data;
src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
src->term_source = term_source;
- src->bytes_in_buffer = (size_t) insize;
- src->next_input_byte = (const JOCTET *) inbuffer;
+ src->bytes_in_buffer = (size_t)insize;
+ src->next_input_byte = (const JOCTET *)inbuffer;
}
#endif
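
jpeg_mem_src() makes no copy: next_input_byte points straight into the caller's buffer (note the const-correct signature in the hunk above), so the buffer must outlive the entire decompression. A minimal sketch that reads header fields from memory; error handling is elided, and jpeg_std_error's default error_exit terminates the process:

    #include <stdio.h>
    #include <jpeglib.h>

    /* Read JPEG dimensions straight from a memory buffer. */
    int peek_dimensions(const unsigned char *jpeg_buf, unsigned long jpeg_size,
                        int *w, int *h)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_mem_src(&cinfo, jpeg_buf, jpeg_size);   /* no copy is made */
      if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
        jpeg_destroy_decompress(&cinfo);
        return -1;
      }
      *w = cinfo.image_width;
      *h = cinfo.image_height;
      jpeg_destroy_decompress(&cinfo);
      return 0;
    }
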
diff --git a/media/libjpeg/jdcoefct.c b/media/libjpeg/jdcoefct.c
index 1a48969b83..15e6cded62 100644
--- a/media/libjpeg/jdcoefct.c
+++ b/media/libjpeg/jdcoefct.c
@@ -5,8 +5,8 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
- * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
+ * Copyright (C) 2015, 2020, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,16 +25,15 @@
/* Forward declarations */
-METHODDEF(int) decompress_onepass
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
#ifdef D_MULTISCAN_FILES_SUPPORTED
-METHODDEF(int) decompress_data
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
#endif
#ifdef BLOCK_SMOOTHING_SUPPORTED
-LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo);
-METHODDEF(int) decompress_smooth_data
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
+METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
#endif
@@ -43,7 +42,7 @@ METHODDEF(int) decompress_smooth_data
*/
METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
{
cinfo->input_iMCU_row = 0;
start_iMCU_row(cinfo);
@@ -55,10 +54,10 @@ start_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_output_pass (j_decompress_ptr cinfo)
+start_output_pass(j_decompress_ptr cinfo)
{
#ifdef BLOCK_SMOOTHING_SUPPORTED
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* If multipass, check to see whether to use block smoothing on this pass */
if (coef->pub.coef_arrays != NULL) {
@@ -83,9 +82,9 @@ start_output_pass (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -101,9 +100,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
MCU_col_num++) {
/* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */
- jzero_far((void *) coef->MCU_buffer[0],
- (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK)));
- if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ jzero_far((void *)coef->MCU_buffer[0],
+ (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->MCU_ctr = MCU_col_num;
@@ -120,28 +121,28 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
* incremented past them!). Note the inner loop relies on having
* allocated the MCU_buffer[] blocks sequentially.
*/
- blkn = 0; /* index of current DCT block within MCU */
+ blkn = 0; /* index of current DCT block within MCU */
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed) {
+ if (!compptr->component_needed) {
blkn += compptr->MCU_blocks;
continue;
}
inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
- useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ useful_width = (MCU_col_num < last_MCU_col) ?
+ compptr->MCU_width : compptr->last_col_width;
output_ptr = output_buf[compptr->component_index] +
- yoffset * compptr->_DCT_scaled_size;
+ yoffset * compptr->_DCT_scaled_size;
start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
- compptr->MCU_sample_width;
+ compptr->MCU_sample_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (cinfo->input_iMCU_row < last_iMCU_row ||
- yoffset+yindex < compptr->last_row_height) {
+ yoffset + yindex < compptr->last_row_height) {
output_col = start_col;
for (xindex = 0; xindex < useful_width; xindex++) {
(*inverse_DCT) (cinfo, compptr,
- (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+ (JCOEFPTR)coef->MCU_buffer[blkn + xindex],
output_ptr, output_col);
output_col += compptr->_DCT_scaled_size;
}
@@ -172,7 +173,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
METHODDEF(int)
-dummy_consume_data (j_decompress_ptr cinfo)
+dummy_consume_data(j_decompress_ptr cinfo)
{
return JPEG_SUSPENDED; /* Always indicate nothing was done */
}
@@ -188,9 +189,9 @@ dummy_consume_data (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-consume_data (j_decompress_ptr cinfo)
+consume_data(j_decompress_ptr cinfo)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
int blkn, ci, xindex, yindex, yoffset;
JDIMENSION start_col;
@@ -202,9 +203,9 @@ consume_data (j_decompress_ptr cinfo)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
cinfo->input_iMCU_row * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, TRUE);
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
/* Note: entropy decoder expects buffer to be zeroed,
* but this is handled automatically by the memory manager
* because we requested a pre-zeroed array.
@@ -222,14 +223,16 @@ consume_data (j_decompress_ptr cinfo)
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
coef->MCU_buffer[blkn++] = buffer_ptr++;
}
}
}
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
/* Try to fetch the MCU. */
- if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->MCU_ctr = MCU_col_num;
@@ -259,9 +262,9 @@ consume_data (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION block_num;
int ci, block_row, block_rows;
@@ -276,7 +279,7 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
while (cinfo->input_scan_number < cinfo->output_scan_number ||
(cinfo->input_scan_number == cinfo->output_scan_number &&
cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
- if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return JPEG_SUSPENDED;
}
@@ -284,19 +287,19 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed)
+ if (!compptr->component_needed)
continue;
/* Align the virtual buffer for this component. */
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
cinfo->output_iMCU_row * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
/* Count non-dummy DCT block rows in this iMCU row. */
if (cinfo->output_iMCU_row < last_iMCU_row)
block_rows = compptr->v_samp_factor;
else {
/* NB: can't use last_row_height here; it is input-side-dependent! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
}
inverse_DCT = cinfo->idct->inverse_DCT[ci];
@@ -307,8 +310,8 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
output_col = 0;
for (block_num = cinfo->master->first_MCU_col[ci];
block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
- (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
- output_ptr, output_col);
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)buffer_ptr, output_ptr,
+ output_col);
buffer_ptr++;
output_col += compptr->_DCT_scaled_size;
}
@@ -327,19 +330,22 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
#ifdef BLOCK_SMOOTHING_SUPPORTED
/*
- * This code applies interblock smoothing as described by section K.8
- * of the JPEG standard: the first 5 AC coefficients are estimated from
- * the DC values of a DCT block and its 8 neighboring blocks.
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
* We apply smoothing only for progressive JPEG decoding, and only if
* the coefficients it can estimate are not yet known to full precision.
*/
-/* Natural-order array positions of the first 5 zigzag-order coefficients */
+/* Natural-order array positions of the first 9 zigzag-order coefficients */
#define Q01_POS 1
#define Q10_POS 8
#define Q20_POS 16
#define Q11_POS 9
#define Q02_POS 2
+#define Q03_POS 3
+#define Q12_POS 10
+#define Q21_POS 17
+#define Q30_POS 24
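
The Q*_POS values map zigzag positions 1-9 onto natural (row-major 8x8) array indices: coefficient ACvu, with v the vertical and u the horizontal frequency, lives at natural index 8*v + u, which is also how workspace[] is addressed in the estimator hunks below. As a sketch:

    /* natural-order index of coefficient (v, u) in an 8x8 block */
    #define NAT(v, u)  ((v) * 8 + (u))
    /* zigzag 1..9 => AC01 AC10 AC20 AC11 AC02 AC03 AC12 AC21 AC30 */
    /* NAT(0,1)=1   NAT(1,0)=8   NAT(2,0)=16  NAT(1,1)=9   NAT(0,2)=2 */
    /* NAT(0,3)=3   NAT(1,2)=10  NAT(2,1)=17  NAT(3,0)=24 */
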
/*
* Determine whether block smoothing is applicable and safe.
@@ -350,51 +356,64 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
LOCAL(boolean)
-smoothing_ok (j_decompress_ptr cinfo)
+smoothing_ok(j_decompress_ptr cinfo)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
boolean smoothing_useful = FALSE;
int ci, coefi;
jpeg_component_info *compptr;
JQUANT_TBL *qtable;
- int *coef_bits;
- int *coef_bits_latch;
+ int *coef_bits, *prev_coef_bits;
+ int *coef_bits_latch, *prev_coef_bits_latch;
- if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
+ if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
return FALSE;
/* Allocate latch area if not already done */
if (coef->coef_bits_latch == NULL)
coef->coef_bits_latch = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components *
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 *
(SAVED_COEFS * sizeof(int)));
coef_bits_latch = coef->coef_bits_latch;
+ prev_coef_bits_latch =
+ &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* All components' quantization values must already be latched. */
if ((qtable = compptr->quant_table) == NULL)
return FALSE;
- /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
+ /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
if (qtable->quantval[0] == 0 ||
qtable->quantval[Q01_POS] == 0 ||
qtable->quantval[Q10_POS] == 0 ||
qtable->quantval[Q20_POS] == 0 ||
qtable->quantval[Q11_POS] == 0 ||
- qtable->quantval[Q02_POS] == 0)
+ qtable->quantval[Q02_POS] == 0 ||
+ qtable->quantval[Q03_POS] == 0 ||
+ qtable->quantval[Q12_POS] == 0 ||
+ qtable->quantval[Q21_POS] == 0 ||
+ qtable->quantval[Q30_POS] == 0)
return FALSE;
/* DC values must be at least partly known for all components. */
coef_bits = cinfo->coef_bits[ci];
+ prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
if (coef_bits[0] < 0)
return FALSE;
+ coef_bits_latch[0] = coef_bits[0];
/* Block smoothing is helpful if some AC coefficients remain inaccurate. */
- for (coefi = 1; coefi <= 5; coefi++) {
+ for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+ else
+ prev_coef_bits_latch[coefi] = -1;
coef_bits_latch[coefi] = coef_bits[coefi];
if (coef_bits[coefi] != 0)
smoothing_useful = TRUE;
}
coef_bits_latch += SAVED_COEFS;
+ prev_coef_bits_latch += SAVED_COEFS;
}
return smoothing_useful;
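
With SAVED_COEFS raised to 10, the latch area is now allocated at twice num_components * SAVED_COEFS ints: the first half latches each component's current-scan coef_bits (the Al values), the second half the previous scan's, so that decompress_smooth_data can fall back to the previous scan's precision info for iMCU rows the entropy decoder only reached after input data ran out (rows past last_good_iMCU_row). A sketch of the indexing, assuming the layout above:

    /* latch layout: 2 * num_components blocks of SAVED_COEFS ints */
    int *cur_bits  = coef->coef_bits_latch + ci * SAVED_COEFS;
    int *prev_bits = coef->coef_bits_latch +
                     (ci + cinfo->num_components) * SAVED_COEFS;
    /* decompress_smooth_data then picks one per component: */
    coef_bits = (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row) ?
                prev_bits : cur_bits;
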
@@ -406,24 +425,27 @@ smoothing_ok (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION block_num, last_block_column;
int ci, block_row, block_rows, access_rows;
JBLOCKARRAY buffer;
- JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
+ JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+ JBLOCKROW next_block_row, next_next_block_row;
JSAMPARRAY output_ptr;
JDIMENSION output_col;
jpeg_component_info *compptr;
inverse_DCT_method_ptr inverse_DCT;
- boolean first_row, last_row;
+ boolean change_dc;
JCOEF *workspace;
int *coef_bits;
JQUANT_TBL *quanttbl;
- JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
- int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
+ JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+ int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+ DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+ DC25;
int Al, pred;
/* Keep a local variable to avoid looking it up more than once */
@@ -431,18 +453,18 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
/* Force some input to be done if we are getting ahead of the input. */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
- ! cinfo->inputctl->eoi_reached) {
+ !cinfo->inputctl->eoi_reached) {
if (cinfo->input_scan_number == cinfo->output_scan_number) {
/* If input is working on current scan, we ordinarily want it to
* have completed the current row. But if input scan is DC,
- * we want it to keep one row ahead so that next block row's DC
+ * we want it to keep two rows ahead so that next two block rows' DC
* values are up to date.
*/
- JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
- if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta)
+ JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
+ if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
break;
}
- if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return JPEG_SUSPENDED;
}
@@ -450,37 +472,56 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed)
+ if (!compptr->component_needed)
continue;
/* Count non-dummy DCT block rows in this iMCU row. */
- if (cinfo->output_iMCU_row < last_iMCU_row) {
+ if (cinfo->output_iMCU_row < last_iMCU_row - 1) {
+ block_rows = compptr->v_samp_factor;
+ access_rows = block_rows * 3; /* this and next two iMCU rows */
+ } else if (cinfo->output_iMCU_row < last_iMCU_row) {
block_rows = compptr->v_samp_factor;
access_rows = block_rows * 2; /* this and next iMCU row */
- last_row = FALSE;
} else {
/* NB: can't use last_row_height here; it is input-side-dependent! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
access_rows = block_rows; /* this iMCU row only */
- last_row = TRUE;
}
/* Align the virtual buffer for this component. */
- if (cinfo->output_iMCU_row > 0) {
- access_rows += compptr->v_samp_factor; /* prior iMCU row too */
+ if (cinfo->output_iMCU_row > 1) {
+ access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+ (JDIMENSION)access_rows, FALSE);
+ buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+ } else if (cinfo->output_iMCU_row > 0) {
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
(cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
- (JDIMENSION) access_rows, FALSE);
+ (JDIMENSION)access_rows, FALSE);
buffer += compptr->v_samp_factor; /* point to current iMCU row */
- first_row = FALSE;
} else {
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
- (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
- first_row = TRUE;
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
}
- /* Fetch component-dependent info */
- coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+ /* Fetch component-dependent info.
+ * If the current scan is incomplete, then we use the component-dependent
+ * info from the previous scan.
+ */
+ if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+ coef_bits =
+ coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+ else
+ coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+ /* We only do DC interpolation if no AC coefficient data is available. */
+ change_dc =
+ coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+ coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+ coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
quanttbl = compptr->quant_table;
Q00 = quanttbl->quantval[0];
Q01 = quanttbl->quantval[Q01_POS];
@@ -488,124 +529,268 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
Q20 = quanttbl->quantval[Q20_POS];
Q11 = quanttbl->quantval[Q11_POS];
Q02 = quanttbl->quantval[Q02_POS];
+ if (change_dc) {
+ Q03 = quanttbl->quantval[Q03_POS];
+ Q12 = quanttbl->quantval[Q12_POS];
+ Q21 = quanttbl->quantval[Q21_POS];
+ Q30 = quanttbl->quantval[Q30_POS];
+ }
inverse_DCT = cinfo->idct->inverse_DCT[ci];
output_ptr = output_buf[ci];
/* Loop over all DCT blocks to be processed. */
for (block_row = 0; block_row < block_rows; block_row++) {
buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
- if (first_row && block_row == 0)
+
+ if (block_row > 0 || cinfo->output_iMCU_row > 0)
+ prev_block_row =
+ buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+ else
prev_block_row = buffer_ptr;
+
+ if (block_row > 1 || cinfo->output_iMCU_row > 1)
+ prev_prev_block_row =
+ buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+ else
+ prev_prev_block_row = prev_block_row;
+
+ if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+ next_block_row =
+ buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
else
- prev_block_row = buffer[block_row-1];
- if (last_row && block_row == block_rows-1)
next_block_row = buffer_ptr;
+
+ if (block_row < block_rows - 2 ||
+ cinfo->output_iMCU_row < last_iMCU_row - 1)
+ next_next_block_row =
+ buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
else
- next_block_row = buffer[block_row+1];
+ next_next_block_row = next_block_row;
+
/* We fetch the surrounding DC values using a sliding-register approach.
- * Initialize all nine here so as to do the right thing on narrow pics.
+ * Initialize all 25 here so as to do the right thing on narrow pics.
*/
- DC1 = DC2 = DC3 = (int) prev_block_row[0][0];
- DC4 = DC5 = DC6 = (int) buffer_ptr[0][0];
- DC7 = DC8 = DC9 = (int) next_block_row[0][0];
+ DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+ DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+ DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+ DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+ DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
output_col = 0;
last_block_column = compptr->width_in_blocks - 1;
for (block_num = cinfo->master->first_MCU_col[ci];
block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
/* Fetch current DCT block into workspace so we can modify it. */
- jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+ jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
/* Update DC values */
- if (block_num < last_block_column) {
- DC3 = (int) prev_block_row[1][0];
- DC6 = (int) buffer_ptr[1][0];
- DC9 = (int) next_block_row[1][0];
+ if (block_num == cinfo->master->first_MCU_col[ci] &&
+ block_num < last_block_column) {
+ DC04 = (int)prev_prev_block_row[1][0];
+ DC09 = (int)prev_block_row[1][0];
+ DC14 = (int)buffer_ptr[1][0];
+ DC19 = (int)next_block_row[1][0];
+ DC24 = (int)next_next_block_row[1][0];
+ }
+ if (block_num + 1 < last_block_column) {
+ DC05 = (int)prev_prev_block_row[2][0];
+ DC10 = (int)prev_block_row[2][0];
+ DC15 = (int)buffer_ptr[2][0];
+ DC20 = (int)next_block_row[2][0];
+ DC25 = (int)next_next_block_row[2][0];
}
- /* Compute coefficient estimates per K.8.
- * An estimate is applied only if coefficient is still zero,
- * and is not known to be fully accurate.
+ /* If DC interpolation is enabled, compute coefficient estimates using
+ * a Gaussian-like kernel, keeping the averages of the DC values.
+ *
+ * If DC interpolation is disabled, compute coefficient estimates using
+ * an algorithm similar to the one described in Section K.8 of the JPEG
+ * standard, except applied to a 5x5 window rather than a 3x3 window.
+ *
+ * An estimate is applied only if the coefficient is still zero and is
+ * not known to be fully accurate.
*/
/* AC01 */
- if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
- num = 36 * Q00 * (DC4 - DC6);
+ if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+ 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+ 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+ DC21 - DC22 + DC24 + DC25) :
+ (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
if (num >= 0) {
- pred = (int) (((Q01<<7) + num) / (Q01<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q01<<7) - num) / (Q01<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q01 << 7) - num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[1] = (JCOEF) pred;
+ workspace[1] = (JCOEF)pred;
}
/* AC10 */
- if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
- num = 36 * Q00 * (DC2 - DC8);
+ if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+ 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+ 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+ 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+ (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
if (num >= 0) {
- pred = (int) (((Q10<<7) + num) / (Q10<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q10<<7) - num) / (Q10<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q10 << 7) - num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[8] = (JCOEF) pred;
+ workspace[8] = (JCOEF)pred;
}
/* AC20 */
- if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
- num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
+ if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
+ num = Q00 * (change_dc ?
+ (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+ 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+ (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
if (num >= 0) {
- pred = (int) (((Q20<<7) + num) / (Q20<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q20<<7) - num) / (Q20<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q20 << 7) - num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[16] = (JCOEF) pred;
+ workspace[16] = (JCOEF)pred;
}
/* AC11 */
- if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
- num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+ if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+ 9 * DC19 + DC21 - DC25) :
+ (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+ DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
if (num >= 0) {
- pred = (int) (((Q11<<7) + num) / (Q11<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q11<<7) - num) / (Q11<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q11 << 7) - num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[9] = (JCOEF) pred;
+ workspace[9] = (JCOEF)pred;
}
/* AC02 */
- if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
- num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
+ if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
+ num = Q00 * (change_dc ?
+ (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+ 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+ (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
if (num >= 0) {
- pred = (int) (((Q02<<7) + num) / (Q02<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q02<<7) - num) / (Q02<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q02 << 7) - num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[2] = (JCOEF) pred;
+ workspace[2] = (JCOEF)pred;
}
+ if (change_dc) {
+ /* AC03 */
+ if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+ num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[3] = (JCOEF)pred;
+ }
+ /* AC12 */
+ if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+ num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[10] = (JCOEF)pred;
+ }
+ /* AC21 */
+ if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+ num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[17] = (JCOEF)pred;
+ }
+ /* AC30 */
+ if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+ num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[24] = (JCOEF)pred;
+ }
+ /* coef_bits[0] is non-negative. Otherwise this function would not
+ * be called.
+ */
+ num = Q00 *
+ (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+ 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+ 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+ 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+ 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+ if (num >= 0) {
+ pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+ } else {
+ pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+ pred = -pred;
+ }
+ workspace[0] = (JCOEF)pred;
+ } /* change_dc */
+
/* OK, do the IDCT */
- (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
- output_ptr, output_col);
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
+ output_col);
/* Advance for next column */
- DC1 = DC2; DC2 = DC3;
- DC4 = DC5; DC5 = DC6;
- DC7 = DC8; DC8 = DC9;
- buffer_ptr++, prev_block_row++, next_block_row++;
+ DC01 = DC02; DC02 = DC03; DC03 = DC04; DC04 = DC05;
+ DC06 = DC07; DC07 = DC08; DC08 = DC09; DC09 = DC10;
+ DC11 = DC12; DC12 = DC13; DC13 = DC14; DC14 = DC15;
+ DC16 = DC17; DC17 = DC18; DC18 = DC19; DC19 = DC20;
+ DC21 = DC22; DC22 = DC23; DC23 = DC24; DC24 = DC25;
+ buffer_ptr++, prev_block_row++, next_block_row++,
+ prev_prev_block_row++, next_next_block_row++;
output_col += compptr->_DCT_scaled_size;
}
output_ptr += compptr->_DCT_scaled_size;
@@ -625,14 +810,14 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
GLOBAL(void)
-jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_coef_ptr coef;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_d_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_d_coef_controller *)coef;
coef->pub.start_input_pass = start_input_pass;
coef->pub.start_output_pass = start_output_pass;
#ifdef BLOCK_SMOOTHING_SUPPORTED
@@ -654,15 +839,15 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
#ifdef BLOCK_SMOOTHING_SUPPORTED
/* If block smoothing could be used, need a bigger window */
if (cinfo->progressive_mode)
- access_rows *= 3;
+ access_rows *= 5;
#endif
coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
- (JDIMENSION) jround_up((long) compptr->width_in_blocks,
- (long) compptr->h_samp_factor),
- (JDIMENSION) jround_up((long) compptr->height_in_blocks,
- (long) compptr->v_samp_factor),
- (JDIMENSION) access_rows);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
+ (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+ (long)compptr->h_samp_factor),
+ (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+ (long)compptr->v_samp_factor),
+ (JDIMENSION)access_rows);
}
coef->pub.consume_data = consume_data;
coef->pub.decompress_data = decompress_data;
@@ -676,7 +861,7 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
int i;
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
coef->MCU_buffer[i] = buffer + i;
@@ -688,6 +873,6 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
/* Allocate the workspace buffer */
coef->workspace = (JCOEF *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JCOEF) * DCTSIZE2);
}
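
Every estimator block in the smoothing hunks above repeats the same rounded, saturating division: pred = round(|num| / (256 * Q)), clamped to (1 << Al) - 1 when Al > 0 (so the prediction never exceeds the magnitude the remaining refinement scans could still supply), with the sign restored afterward. Factored out as a sketch, it is just:

    /* Rounded, saturating estimate used by decompress_smooth_data:
     * pred = round(|num| / (256 * Q)), capped at (1 << Al) - 1, signed.
     */
    static int smooth_predict(JLONG num, JLONG Q, int Al)
    {
      int pred;

      if (num >= 0) {
        pred = (int)(((Q << 7) + num) / (Q << 8)); /* +Q*128 rounds the divide */
        if (Al > 0 && pred >= (1 << Al))
          pred = (1 << Al) - 1;
      } else {
        pred = (int)(((Q << 7) - num) / (Q << 8));
        if (Al > 0 && pred >= (1 << Al))
          pred = (1 << Al) - 1;
        pred = -pred;
      }
      return pred;
    }
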
diff --git a/media/libjpeg/jdcoefct.h b/media/libjpeg/jdcoefct.h
index bf6beb274b..9a0e780663 100644
--- a/media/libjpeg/jdcoefct.h
+++ b/media/libjpeg/jdcoefct.h
@@ -5,6 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
@@ -51,7 +52,7 @@ typedef struct {
#ifdef BLOCK_SMOOTHING_SUPPORTED
/* When doing block smoothing, we latch coefficient Al values here */
int *coef_bits_latch;
-#define SAVED_COEFS 6 /* we save coef_bits[0..5] */
+#define SAVED_COEFS 10 /* we save coef_bits[0..9] */
#endif
} my_coef_controller;
@@ -59,10 +60,10 @@ typedef my_coef_controller *my_coef_ptr;
LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
+start_iMCU_row(j_decompress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row (input side) */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -71,7 +72,7 @@ start_iMCU_row (j_decompress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+ if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
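
start_iMCU_row distinguishes interleaved scans (one MCU row per iMCU row) from noninterleaved ones (v_samp_factor MCU rows, fewer on the final row). last_row_height is the count of real block rows in the final iMCU row, computed as height_in_blocks mod v_samp_factor with 0 meaning a full v_samp_factor, the same fix-up decompress_data applies above. A worked example, assuming a noninterleaved scan:

    /* example: noninterleaved Y scan, v_samp_factor == 2,
     * height_in_blocks == 35  ->  total_iMCU_rows == 18:
     *   iMCU rows 0..16: MCU_rows_per_iMCU_row = 2
     *   iMCU row  17   : last_row_height = 35 % 2 = 1
     */
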
diff --git a/media/libjpeg/jdcol565.c b/media/libjpeg/jdcol565.c
index 349fce4a66..53c7bd9187 100644
--- a/media/libjpeg/jdcol565.c
+++ b/media/libjpeg/jdcol565.c
@@ -17,22 +17,22 @@
INLINE
LOCAL(void)
-ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
SHIFT_TEMPS
while (--num_rows >= 0) {
@@ -45,31 +45,31 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
@@ -80,15 +80,15 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- y = GETJSAMPLE(*inptr0);
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ y = *inptr0;
+ cb = *inptr1;
+ cr = *inptr2;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -96,22 +96,22 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
SHIFT_TEMPS
@@ -125,23 +125,23 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)), d0)];
b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,16 +165,16 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- y = GETJSAMPLE(*inptr0);
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ y = *inptr0;
+ cb = *inptr1;
+ cr = *inptr2;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)), d0)];
b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -182,9 +182,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
@@ -202,34 +202,34 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_SHORT_565(r, g, b);
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
outptr += 4;
}
if (num_cols & 1) {
- r = GETJSAMPLE(*inptr0);
- g = GETJSAMPLE(*inptr1);
- b = GETJSAMPLE(*inptr2);
+ r = *inptr0;
+ g = *inptr1;
+ b = *inptr2;
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -237,14 +237,14 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
JDIMENSION num_cols = cinfo->output_width;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
SHIFT_TEMPS
@@ -259,24 +259,24 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
@@ -284,11 +284,11 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+ r = range_limit[DITHER_565_R(*inptr0, d0)];
+ g = range_limit[DITHER_565_G(*inptr1, d0)];
+ b = range_limit[DITHER_565_B(*inptr2, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -296,9 +296,9 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
@@ -313,7 +313,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
if (PACK_NEED_ALIGNMENT(outptr)) {
g = *inptr++;
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
@@ -328,7 +328,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
if (num_cols & 1) {
g = *inptr;
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -336,13 +336,13 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
JDIMENSION num_cols = cinfo->output_width;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
@@ -356,7 +356,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
g = *inptr++;
g = range_limit[DITHER_565_R(g, d0)];
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
@@ -378,7 +378,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
g = *inptr;
g = range_limit[DITHER_565_R(g, d0)];
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
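
All of these converters funnel through PACK_SHORT_565, which squeezes 8-bit R/G/B into the 5-6-5 layout; the PACK_NEED_ALIGNMENT branch handles a misaligned first pixel so the two-pixels-per-iteration loop can use aligned 32-bit stores. The little-endian form of the packing is equivalent to this sketch:

    /* pack 8-bit r, g, b into a 16-bit 5-6-5 pixel (little-endian layout) */
    static INT16 pack565(int r, int g, int b)
    {
      return (INT16)(((r << 8) & 0xF800) |   /* top 5 bits of r -> 15..11 */
                     ((g << 3) & 0x07E0) |   /* top 6 bits of g -> 10..5  */
                     ((b >> 3) & 0x001F));   /* top 5 bits of b ->  4..0  */
    }
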
diff --git a/media/libjpeg/jdcolext.c b/media/libjpeg/jdcolext.c
index 59b676cc4d..863c7a2fbc 100644
--- a/media/libjpeg/jdcolext.c
+++ b/media/libjpeg/jdcolext.c
@@ -28,22 +28,22 @@
INLINE
LOCAL(void)
-ycc_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
SHIFT_TEMPS
while (--num_rows >= 0) {
@@ -53,14 +53,14 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- y = GETJSAMPLE(inptr0[col]);
- cb = GETJSAMPLE(inptr1[col]);
- cr = GETJSAMPLE(inptr2[col]);
+ y = inptr0[col];
+ cb = inptr1[col];
+ cr = inptr2[col];
/* Range-limiting is essential due to noise introduced by DCT losses. */
outptr[RGB_RED] = range_limit[y + Crrtab[cr]];
outptr[RGB_GREEN] = range_limit[y +
- ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
- SCALEBITS))];
+ ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+ SCALEBITS))];
outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]];
/* Set unused byte to 0xFF so it can be interpreted as an opaque */
/* alpha channel value */
@@ -81,9 +81,9 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
@@ -93,7 +93,6 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
inptr = input_buf[0][input_row++];
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- /* We can dispense with GETJSAMPLE() here */
outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
/* Set unused byte to 0xFF so it can be interpreted as an opaque */
/* alpha channel value */
@@ -112,9 +111,9 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr0, inptr1, inptr2;
register JSAMPROW outptr;
@@ -128,7 +127,6 @@ rgb_rgb_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- /* We can dispense with GETJSAMPLE() here */
outptr[RGB_RED] = inptr0[col];
outptr[RGB_GREEN] = inptr1[col];
outptr[RGB_BLUE] = inptr2[col];
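
These convert loops are pure table lookups. The tables, built by build_ycc_rgb_table() in jdcolor.c (its allocations close out this section), precompute the BT.601 inverse transform R = Y + 1.40200(Cr-128), G = Y - 0.34414(Cb-128) - 0.71414(Cr-128), B = Y + 1.77200(Cb-128), leaving only adds, one shift, and the range_limit clamp per pixel. A sketch of the table contents for the 8-bit path (CENTERJSAMPLE == 128), reconstructed from the function body using FIX, ONE_HALF, and SCALEBITS as defined in jdcolor.c below:

    /* what build_ycc_rgb_table computes, per entry */
    for (i = 0, x = -128; i <= MAXJSAMPLE; i++, x++) {
      Crrtab[i] = (int)RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
      Cbbtab[i] = (int)RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
      Crgtab[i] = (-FIX(0.71414)) * x;            /* kept scaled; the two  */
      Cbgtab[i] = (-FIX(0.34414)) * x + ONE_HALF; /* green terms are summed
                                                     and shifted once      */
    }
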
diff --git a/media/libjpeg/jdcolor.c b/media/libjpeg/jdcolor.c
index ab8fa24925..8da2b4eaf2 100644
--- a/media/libjpeg/jdcolor.c
+++ b/media/libjpeg/jdcolor.c
@@ -74,8 +74,8 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* We allocate one big table for RGB->Y conversion and divide it up into
* three parts, instead of doing three alloc_small requests. This lets us
@@ -85,9 +85,9 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
#define R_Y_OFF 0 /* offset to R => Y section */
-#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */
-#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */
-#define TABLE_SIZE (3*(MAXJSAMPLE+1))
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* etc. */
+#define TABLE_SIZE (3 * (MAXJSAMPLE + 1))
/* Include inline routines for colorspace extensions */
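
FIX() converts a real coefficient to 16-bit fixed point, e.g. FIX(0.29900) = round(0.29900 * 65536) = 19595, and ONE_HALF is the rounding term for the final shift. The single-allocation trick the comment above describes puts three (MAXJSAMPLE + 1)-entry sections at R_Y_OFF, G_Y_OFF, and B_Y_OFF in one table, so an RGB-to-grayscale conversion is three lookups, two adds, and one shift. A sketch, assuming a table named rgb_y_tab as in this file's RGB->grayscale path:

    /* one shared table, three sections;
     * Y = 0.29900 R + 0.58700 G + 0.11400 B
     */
    for (i = 0; i <= MAXJSAMPLE; i++) {
      rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
      rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
      rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF; /* fold rounding */
    }
    /* per pixel: */
    y = (int)((rgb_y_tab[r + R_Y_OFF] + rgb_y_tab[g + G_Y_OFF] +
               rgb_y_tab[b + B_Y_OFF]) >> SCALEBITS);
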
@@ -98,13 +98,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
-#define gray_rgb_convert_internal gray_extrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
+#define gray_rgb_convert_internal gray_extrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -114,14 +114,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
-#define gray_rgb_convert_internal gray_extrgbx_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
+#define gray_rgb_convert_internal gray_extrgbx_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -132,13 +132,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
-#define gray_rgb_convert_internal gray_extbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
+#define gray_rgb_convert_internal gray_extbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -148,14 +148,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
-#define gray_rgb_convert_internal gray_extbgrx_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
+#define gray_rgb_convert_internal gray_extbgrx_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -166,14 +166,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
-#define gray_rgb_convert_internal gray_extxbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
+#define gray_rgb_convert_internal gray_extxbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -184,14 +184,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
-#define gray_rgb_convert_internal gray_extxrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
+#define gray_rgb_convert_internal gray_extxrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
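These hunks only re-align the macro bindings; the underlying technique is unchanged: jdcolext.c is a template compiled once per pixel layout, with RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE and the *_internal names rebound before each #include and undefined afterwards. A single-file sketch of the same idea; since a one-file example cannot re-include a separate template, the template is collapsed into a function-like macro here, and all names are invented:

#include <stdio.h>

/* Bind the layout, instantiate the template, repeat per layout --
 * the same pattern as the #define / #include "jdcolext.c" / #undef
 * sequences above. */
#define DEFINE_STORE(name, OFF_R, OFF_G, OFF_B)                \
  static void name(unsigned char *out, int r, int g, int b) {  \
    out[OFF_R] = (unsigned char)r;                             \
    out[OFF_G] = (unsigned char)g;                             \
    out[OFF_B] = (unsigned char)b;                             \
  }

DEFINE_STORE(store_rgb, 0, 1, 2)   /* like the EXT_RGB binding */
DEFINE_STORE(store_bgr, 2, 1, 0)   /* like the EXT_BGR binding */

int main(void)
{
  unsigned char px[3];
  store_bgr(px, 255, 0, 0);                  /* red, BGR byte order */
  printf("%u %u %u\n", px[0], px[1], px[2]); /* 0 0 255 */
  return 0;
}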
@@ -208,25 +208,25 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
int i;
JLONG x;
SHIFT_TEMPS
cconvert->Cr_r_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
cconvert->Cb_b_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
cconvert->Cr_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
cconvert->Cb_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
/* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -238,10 +238,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
cconvert->Cb_b_tab[i] = (int)
RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
/* Cr=>G value is scaled-up -0.71414 * x */
- cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+ cconvert->Cr_g_tab[i] = (-FIX(0.71414)) * x;
/* Cb=>G value is scaled-up -0.34414 * x */
/* We also add in ONE_HALF so that need not do it in inner loop */
- cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+ cconvert->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
}
}
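The loop fills the tables from the JFIF YCbCr-to-RGB equations R = Y + 1.40200 (Cr - 128), G = Y - 0.34414 (Cb - 128) - 0.71414 (Cr - 128), B = Y + 1.77200 (Cb - 128); the Cr=>G and Cb=>G entries stay scaled up so both can be summed before a single right shift. A floating-point reference for what the tables compute (a sketch only; the real path is integer and range-limits afterwards; link with -lm):

#include <math.h>
#include <stdio.h>

static void ycc_to_rgb(int y, int cb, int cr, int *r, int *g, int *b)
{
  double cbx = cb - 128.0, crx = cr - 128.0;
  *r = (int)lround(y + 1.40200 * crx);
  *g = (int)lround(y - 0.34414 * cbx - 0.71414 * crx);
  *b = (int)lround(y + 1.77200 * cbx);
}

int main(void)
{
  int r, g, b;
  ycc_to_rgb(128, 128, 255, &r, &g, &b);  /* strong red chroma */
  printf("%d %d %d\n", r, g, b);          /* 306 37 128: R needs clamping */
  return 0;
}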
@@ -251,43 +251,42 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
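From the application's side, all of this dispatch hangs off one field set between jpeg_read_header() and jpeg_start_decompress(). A hedged sketch against the public libjpeg-turbo API (error handling and allocation checks elided; assumes a build that provides jpeg_mem_src() and the JCS_EXT_* colorspaces):

#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

/* Requesting JCS_EXT_BGRA makes ycc_rgb_convert() dispatch to the
 * ycc_extbgrx_convert_internal() specialization; alpha bytes come
 * back as 0xFF. */
static unsigned char *decode_bgra(const unsigned char *jpg, unsigned long len,
                                  int *w, int *h)
{
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;
  unsigned char *pixels;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_decompress(&cinfo);
  jpeg_mem_src(&cinfo, jpg, len);
  jpeg_read_header(&cinfo, TRUE);
  cinfo.out_color_space = JCS_EXT_BGRA;      /* the one-field change */
  jpeg_start_decompress(&cinfo);

  *w = (int)cinfo.output_width;
  *h = (int)cinfo.output_height;
  pixels = malloc((size_t)*w * (size_t)*h * 4);
  while (cinfo.output_scanline < cinfo.output_height) {
    JSAMPROW row = pixels + (size_t)cinfo.output_scanline * (size_t)*w * 4;
    jpeg_read_scanlines(&cinfo, &row, 1);
  }
  jpeg_finish_decompress(&cinfo);
  jpeg_destroy_decompress(&cinfo);
  return pixels;   /* caller frees; B,G,R,A byte order */
}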
@@ -300,21 +299,21 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
*/
LOCAL(void)
-build_rgb_y_table (j_decompress_ptr cinfo)
+build_rgb_y_table(j_decompress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
JLONG *rgb_y_tab;
JLONG i;
/* Allocate and fill in the conversion tables. */
cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(TABLE_SIZE * sizeof(JLONG)));
for (i = 0; i <= MAXJSAMPLE; i++) {
- rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
- rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
- rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+ rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+ rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+ rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
}
}
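The grayscale weights are the usual Rec. 601 luminance coefficients, with ONE_HALF folded into the blue table so the per-pixel loop is three loads, two adds, and one shift. The same computation without the tables, as a checkable sketch:

#include <stdio.h>

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

/* What rgb_gray_convert() computes per pixel, with the rounding term
 * carried in the blue contribution exactly as in build_rgb_y_table(). */
static unsigned char rgb_to_y(int r, int g, int b)
{
  long y = FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b + ONE_HALF;
  return (unsigned char)(y >> SCALEBITS);
}

int main(void)
{
  printf("%u\n", rgb_to_y(255, 255, 255));  /* 255: the weights sum to 1 */
  printf("%u\n", rgb_to_y(255, 0, 0));      /* 76 */
  return 0;
}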
@@ -324,11 +323,10 @@ build_rgb_y_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-rgb_gray_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
register JLONG *ctab = cconvert->rgb_y_tab;
register JSAMPROW outptr;
@@ -343,13 +341,12 @@ rgb_gray_convert (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr0[col]);
- g = GETJSAMPLE(inptr1[col]);
- b = GETJSAMPLE(inptr2[col]);
+ r = inptr0[col];
+ g = inptr1[col];
+ b = inptr2[col];
/* Y */
- outptr[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
}
}
}
@@ -361,9 +358,8 @@ rgb_gray_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-null_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
register JDIMENSION col;
@@ -423,12 +419,11 @@ null_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-grayscale_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
- num_rows, cinfo->output_width);
+ jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
+ cinfo->output_width);
}
@@ -437,43 +432,42 @@ grayscale_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-gray_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
@@ -483,43 +477,42 @@ gray_rgb_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-rgb_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
@@ -532,11 +525,10 @@ rgb_rgb_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-ycck_cmyk_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2, inptr3;
@@ -558,17 +550,17 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- y = GETJSAMPLE(inptr0[col]);
- cb = GETJSAMPLE(inptr1[col]);
- cr = GETJSAMPLE(inptr2[col]);
+ y = inptr0[col];
+ cb = inptr1[col];
+ cr = inptr2[col];
/* Range-limiting is essential due to noise introduced by DCT losses. */
outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */
outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */
- ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+ ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)))];
outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */
/* K passes through unchanged */
- outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */
+ outptr[3] = inptr3[col];
outptr += 4;
}
}
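The YCCK path reuses the YCbCr tables to recover R'G'B', range-limits, and complements each channel, leaving K alone (Adobe-style inverted CMYK). A floating-point reference sketch of that per-pixel step, with a local clamp in place of the range_limit table:

#include <stdio.h>

static int clamp255(double v)
{
  return v < 0.0 ? 0 : v > 255.0 ? 255 : (int)(v + 0.5);
}

/* Sketch only: the real code uses the integer tables built above. */
static void ycck_to_cmyk(int y, int cb, int cr, int k, unsigned char out[4])
{
  double r = y + 1.40200 * (cr - 128);
  double g = y - 0.34414 * (cb - 128) - 0.71414 * (cr - 128);
  double b = y + 1.77200 * (cb - 128);
  out[0] = (unsigned char)(255 - clamp255(r));   /* C */
  out[1] = (unsigned char)(255 - clamp255(g));   /* M */
  out[2] = (unsigned char)(255 - clamp255(b));   /* Y */
  out[3] = (unsigned char)k;                     /* K passes through */
}

int main(void)
{
  unsigned char cmyk[4];
  ycck_to_cmyk(200, 128, 128, 30, cmyk);   /* neutral chroma */
  printf("%u %u %u %u\n", cmyk[0], cmyk[1], cmyk[2], cmyk[3]); /* 55 55 55 30 */
  return 0;
}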
@@ -579,16 +571,15 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
* RGB565 conversion
*/
-#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
- (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
- (((g) << 11) & 0xE000) | \
- (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+ ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+ (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
-#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
-#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
+#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels) ((*(int *)(addr)) = pixels)
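The packing macros squeeze 8-bit channels into a 5-6-5 layout by keeping each channel's top bits; the reflowed versions are arithmetically identical. A quick check of the little-endian variant:

#include <stdio.h>

#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))

int main(void)
{
  /* Pure red (255,0,0): the top 5 bits of R land in bits 15..11. */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 0, 0));     /* 0xF800 */
  /* White keeps 5/6/5 significant bits per channel. */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 255, 255)); /* 0xFFFF */
  return 0;
}

The _BE variant produces the byte-swapped form of the same 16-bit value, so the bytes land in memory in the same order on a big-endian store.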
@@ -600,7 +591,7 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
/* Declarations for ordered dithering
*
* We use a 4x4 ordered dither array packed into 32 bits. This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
*/
#define DITHER_MASK 0x3
@@ -616,7 +607,7 @@ static const JLONG dither_matrix[4] = {
static INLINE boolean is_big_endian(void)
{
int test_value = 1;
- if(*(char *)&test_value != 1)
+ if (*(char *)&test_value != 1)
return TRUE;
return FALSE;
}
@@ -624,14 +615,14 @@ static INLINE boolean is_big_endian(void)
/* Include inline routines for RGB565 conversion */
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
-#define gray_rgb565_convert_internal gray_rgb565_convert_le
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
+#define gray_rgb565_convert_internal gray_rgb565_convert_le
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
#include "jdcol565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -642,14 +633,14 @@ static INLINE boolean is_big_endian(void)
#undef gray_rgb565_convert_internal
#undef gray_rgb565D_convert_internal
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
-#define gray_rgb565_convert_internal gray_rgb565_convert_be
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
+#define gray_rgb565_convert_internal gray_rgb565_convert_be
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
#include "jdcol565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -662,9 +653,8 @@ static INLINE boolean is_big_endian(void)
METHODDEF(void)
-ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -674,9 +664,8 @@ ycc_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-ycc_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -686,9 +675,8 @@ ycc_rgb565D_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-rgb_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -698,9 +686,8 @@ rgb_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-rgb_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -710,9 +697,8 @@ rgb_rgb565D_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-gray_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -722,9 +708,8 @@ gray_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-gray_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -738,7 +723,7 @@ gray_rgb565D_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-start_pass_dcolor (j_decompress_ptr cinfo)
+start_pass_dcolor(j_decompress_ptr cinfo)
{
/* no work needed */
}
@@ -749,15 +734,15 @@ start_pass_dcolor (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_color_deconverter (j_decompress_ptr cinfo)
+jinit_color_deconverter(j_decompress_ptr cinfo)
{
my_cconvert_ptr cconvert;
int ci;
cconvert = (my_cconvert_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_color_deconverter));
- cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
+ cinfo->cconvert = (struct jpeg_color_deconverter *)cconvert;
cconvert->pub.start_pass = start_pass_dcolor;
/* Make sure num_components agrees with jpeg_color_space */
@@ -843,11 +828,11 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
cinfo->out_color_components = 3;
if (cinfo->dither_mode == JDITHER_NONE) {
if (cinfo->jpeg_color_space == JCS_YCbCr) {
- if (jsimd_can_ycc_rgb565())
- cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
- else {
- cconvert->pub.color_convert = ycc_rgb565_convert;
- build_ycc_rgb_table(cinfo);
+ if (jsimd_can_ycc_rgb565())
+ cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb565_convert;
+ build_ycc_rgb_table(cinfo);
}
} else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
cconvert->pub.color_convert = gray_rgb565_convert;
diff --git a/media/libjpeg/jdct.h b/media/libjpeg/jdct.h
index faf8e1cf03..66d1718b77 100644
--- a/media/libjpeg/jdct.h
+++ b/media/libjpeg/jdct.h
@@ -36,7 +36,7 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */
typedef unsigned int UDCTELEM;
typedef unsigned long long UDCTELEM2;
#else
-typedef short DCTELEM; /* prefer 16 bit with SIMD for parellelism */
+typedef short DCTELEM;                 /* prefer 16 bit with SIMD for parallelism */
typedef unsigned short UDCTELEM;
typedef unsigned int UDCTELEM2;
#endif
@@ -63,15 +63,15 @@ typedef unsigned long long UDCTELEM2;
* Each IDCT routine has its own ideas about the best dct_table element type.
*/
-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
#if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
-#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
+typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
#else
-typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */
+typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */
#endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
/*
@@ -90,64 +90,64 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
/* Extern declarations for the forward and inverse DCT routines. */
-EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
-EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
-EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
-
-EXTERN(void) jpeg_idct_islow
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_ifast
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_float
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_7x7
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_6x6
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_5x5
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_4x4
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_3x3
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_2x2
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_1x1
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_9x9
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_10x10
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_11x11
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_12x12
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_13x13
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_14x14
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_15x15
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_16x16
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
+
+EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
/*
@@ -160,22 +160,22 @@ EXTERN(void) jpeg_idct_16x16
* and may differ from one module to the next.
*/
-#define ONE ((JLONG) 1)
-#define CONST_SCALE (ONE << CONST_BITS)
+#define ONE ((JLONG)1)
+#define CONST_SCALE (ONE << CONST_BITS)
/* Convert a positive real constant to an integer scaled by CONST_SCALE.
* Caution: some C compilers fail to reduce "FIX(constant)" at compile time,
* thus causing a lot of useless floating-point operations at run time.
*/
-#define FIX(x) ((JLONG) ((x) * CONST_SCALE + 0.5))
+#define FIX(x) ((JLONG)((x) * CONST_SCALE + 0.5))
/* Descale and correctly round a JLONG value that's scaled by N bits.
* We assume RIGHT_SHIFT rounds towards minus infinity, so adding
* the fudge factor is correct for either sign of X.
*/
-#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
+#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)
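DESCALE() adds half of the final unit before shifting, which rounds to nearest under the assumption stated above that RIGHT_SHIFT behaves as an arithmetic shift. A checkable sketch with RIGHT_SHIFT reduced to the plain operator:

#include <stdio.h>

#define ONE            ((long)1)
#define RIGHT_SHIFT(x, shft)  ((x) >> (shft))   /* assuming arithmetic >> */
#define DESCALE(x, n)  RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)

int main(void)
{
  /* Round 13-bit fixed point to integer: 20480/8192 = 2.5 -> 3. */
  printf("%ld\n", DESCALE(20480L, 13));    /* 3 */
  /* -20480/8192 = -2.5 -> -2: the same fudge term works for both signs. */
  printf("%ld\n", DESCALE(-20480L, 13));   /* -2 */
  return 0;
}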
/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
* This macro is used only when the two inputs will actually be no more than
@@ -187,22 +187,22 @@ EXTERN(void) jpeg_idct_16x16
*/
#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
-#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT16) (const)))
+#define MULTIPLY16C16(var, const) (((INT16)(var)) * ((INT16)(const)))
#endif
#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((JLONG) (const)))
+#define MULTIPLY16C16(var, const) (((INT16)(var)) * ((JLONG)(const)))
#endif
#ifndef MULTIPLY16C16 /* default definition */
-#define MULTIPLY16C16(var,const) ((var) * (const))
+#define MULTIPLY16C16(var, const) ((var) * (const))
#endif
/* Same except both inputs are variables. */
#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
-#define MULTIPLY16V16(var1,var2) (((INT16) (var1)) * ((INT16) (var2)))
+#define MULTIPLY16V16(var1, var2) (((INT16)(var1)) * ((INT16)(var2)))
#endif
#ifndef MULTIPLY16V16 /* default definition */
-#define MULTIPLY16V16(var1,var2) ((var1) * (var2))
+#define MULTIPLY16V16(var1, var2) ((var1) * (var2))
#endif
diff --git a/media/libjpeg/jddctmgr.c b/media/libjpeg/jddctmgr.c
index 3a5ba7e893..e78d7bebe2 100644
--- a/media/libjpeg/jddctmgr.c
+++ b/media/libjpeg/jddctmgr.c
@@ -6,7 +6,7 @@
* Modified 2002-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2010, 2015, 2022, D. R. Commander.
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -94,9 +94,9 @@ typedef union {
*/
METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
{
- my_idct_ptr idct = (my_idct_ptr) cinfo->idct;
+ my_idct_ptr idct = (my_idct_ptr)cinfo->idct;
int ci, i;
jpeg_component_info *compptr;
int method = 0;
@@ -233,7 +233,7 @@ start_pass (j_decompress_ptr cinfo)
* multiplier table all-zero; we'll be reading zeroes from the
* coefficient controller's buffer anyway.
*/
- if (! compptr->component_needed || idct->cur_method[ci] == method)
+ if (!compptr->component_needed || idct->cur_method[ci] == method)
continue;
qtbl = compptr->quant_table;
if (qtbl == NULL) /* happens if no data yet for component */
@@ -246,9 +246,9 @@ start_pass (j_decompress_ptr cinfo)
/* For LL&M IDCT method, multipliers are equal to raw quantization
* coefficients, but are stored as ints to ensure access efficiency.
*/
- ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *)compptr->dct_table;
for (i = 0; i < DCTSIZE2; i++) {
- ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
+ ismtbl[i] = (ISLOW_MULT_TYPE)qtbl->quantval[i];
}
}
break;
@@ -263,8 +263,8 @@ start_pass (j_decompress_ptr cinfo)
* For integer operation, the multiplier table is to be scaled by
* IFAST_SCALE_BITS.
*/
- IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
-#define CONST_BITS 14
+ IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *)compptr->dct_table;
+#define CONST_BITS 14
static const INT16 aanscales[DCTSIZE2] = {
/* precomputed values scaled up by 14 bits */
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
@@ -280,9 +280,9 @@ start_pass (j_decompress_ptr cinfo)
for (i = 0; i < DCTSIZE2; i++) {
ifmtbl[i] = (IFAST_MULT_TYPE)
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-IFAST_SCALE_BITS);
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - IFAST_SCALE_BITS);
}
}
break;
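For the AAN integer IDCT, each multiplier is quantval[i] * aanscales[i], descaled from CONST_BITS (14) down to IFAST_SCALE_BITS (2), i.e. a rounding shift by 12. One entry worked through as a sketch:

#include <stdio.h>

#define CONST_BITS        14
#define IFAST_SCALE_BITS  2    /* 8-bit samples */
#define DESCALE(x, n)     (((x) + (1L << ((n) - 1))) >> (n))

int main(void)
{
  /* aanscales[0] is 16384, i.e. 1.0 in 14-bit fixed point, so a
   * quantizer of 16 becomes 16 << IFAST_SCALE_BITS = 64. */
  long q = 16, aan = 16384;
  printf("%ld\n", DESCALE(q * aan, CONST_BITS - IFAST_SCALE_BITS));  /* 64 */
  return 0;
}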
@@ -295,7 +295,7 @@ start_pass (j_decompress_ptr cinfo)
* scalefactor[0] = 1
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
*/
- FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+ FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *)compptr->dct_table;
int row, col;
static const double aanscalefactor[DCTSIZE] = {
1.0, 1.387039845, 1.306562965, 1.175875602,
@@ -306,7 +306,7 @@ start_pass (j_decompress_ptr cinfo)
for (row = 0; row < DCTSIZE; row++) {
for (col = 0; col < DCTSIZE; col++) {
fmtbl[i] = (FLOAT_MULT_TYPE)
- ((double) qtbl->quantval[i] *
+ ((double)qtbl->quantval[i] *
aanscalefactor[row] * aanscalefactor[col]);
i++;
}
@@ -327,25 +327,25 @@ start_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_inverse_dct (j_decompress_ptr cinfo)
+jinit_inverse_dct(j_decompress_ptr cinfo)
{
my_idct_ptr idct;
int ci;
jpeg_component_info *compptr;
idct = (my_idct_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_idct_controller));
- cinfo->idct = (struct jpeg_inverse_dct *) idct;
+ cinfo->idct = (struct jpeg_inverse_dct *)idct;
idct->pub.start_pass = start_pass;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Allocate and pre-zero a multiplier table for each component */
compptr->dct_table =
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(multiplier_table));
- MEMZERO(compptr->dct_table, sizeof(multiplier_table));
+ memset(compptr->dct_table, 0, sizeof(multiplier_table));
/* Mark multiplier table not yet set up for any method */
idct->cur_method[ci] = -1;
}
diff --git a/media/libjpeg/jdhuff.c b/media/libjpeg/jdhuff.c
index fa78e2962a..679d221685 100644
--- a/media/libjpeg/jdhuff.c
+++ b/media/libjpeg/jdhuff.c
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,6 +16,9 @@
* up to the start of the current MCU. To do this, we copy state variables
* into local working storage, and update them back to the permanent
* storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -36,24 +40,6 @@ typedef struct {
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
typedef struct {
struct jpeg_entropy_decoder pub; /* public fields */
@@ -88,9 +74,9 @@ typedef huff_entropy_decoder *huff_entropy_ptr;
*/
METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
+start_pass_huff_decoder(j_decompress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, blkn, dctbl, actbl;
d_derived_tbl **pdtbl;
jpeg_component_info *compptr;
@@ -99,7 +85,7 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
* This ought to be an error condition, but we make it a warning because
* there are some baseline files out there with all zeroes in these bytes.
*/
- if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
@@ -152,8 +138,8 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
- d_derived_tbl **pdtbl)
+jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
+ d_derived_tbl **pdtbl)
{
JHUFF_TBL *htbl;
d_derived_tbl *dtbl;
@@ -178,7 +164,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/* Allocate a workspace if we haven't already done so. */
if (*pdtbl == NULL)
*pdtbl = (d_derived_tbl *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(d_derived_tbl));
dtbl = *pdtbl;
dtbl->pub = htbl; /* fill in back link */
@@ -187,11 +173,11 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
p = 0;
for (l = 1; l <= 16; l++) {
- i = (int) htbl->bits[l];
+ i = (int)htbl->bits[l];
if (i < 0 || p + i > 256) /* protect against table overrun */
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
while (i--)
- huffsize[p++] = (char) l;
+ huffsize[p++] = (char)l;
}
huffsize[p] = 0;
numsymbols = p;
@@ -203,14 +189,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
si = huffsize[0];
p = 0;
while (huffsize[p]) {
- while (((int) huffsize[p]) == si) {
+ while (((int)huffsize[p]) == si) {
huffcode[p++] = code;
code++;
}
/* code is now 1 more than the last code used for codelength si; but
* it must still fit in si bits, since no code is allowed to be all ones.
*/
- if (((JLONG) code) >= (((JLONG) 1) << si))
+ if (((JLONG)code) >= (((JLONG)1) << si))
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
code <<= 1;
si++;
@@ -224,9 +210,9 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/* valoffset[l] = huffval[] index of 1st symbol of code length l,
* minus the minimum code of length l
*/
- dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
+ dtbl->valoffset[l] = (JLONG)p - (JLONG)huffcode[p];
p += htbl->bits[l];
- dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
+ dtbl->maxcode[l] = huffcode[p - 1]; /* maximum code of length l */
} else {
dtbl->maxcode[l] = -1; /* -1 if no codes of this length */
}
@@ -241,16 +227,16 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
* with that code.
*/
- for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
- dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
+ for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+ dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
p = 0;
for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
- for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
+ for (i = 1; i <= (int)htbl->bits[l]; i++, p++) {
/* l = current code's length, p = its index in huffcode[] & huffval[]. */
/* Generate left-justified code followed by all possible bit sequences */
- lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
- for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
+ lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+ for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--) {
dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
lookbits++;
}
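The lookahead table maps every possible HUFF_LOOKAHEAD-bit prefix to the pair (code length, symbol), so most symbols decode with one PEEK_BITS and one array index. A toy with the lookahead shrunk to 3 bits but the same packing of the length in the bits above the symbol byte (in d_derived_tbl the shift is HUFF_LOOKAHEAD, which is 8; the two-code table here is hypothetical):

#include <stdio.h>

#define LOOKAHEAD 3   /* toy value; jdhuff.h uses HUFF_LOOKAHEAD = 8 */

int main(void)
{
  /* Codes: "0" -> 'A' (length 1), "10" -> 'B' (length 2). */
  int lookup[1 << LOOKAHEAD], i;

  for (i = 0; i < (1 << LOOKAHEAD); i++)
    lookup[i] = (LOOKAHEAD + 1) << 8;        /* "longer than lookahead" */
  for (i = 0; i < 4; i++)                    /* prefixes 000..011 -> 'A' */
    lookup[i] = (1 << 8) | 'A';
  for (i = 4; i < 6; i++)                    /* prefixes 100..101 -> 'B' */
    lookup[i] = (2 << 8) | 'B';

  /* Peeking 3 bits = 0b101 decodes 'B' and says to drop 2 bits. */
  printf("len=%d sym=%c\n", lookup[5] >> 8, lookup[5] & 0xFF);
  return 0;
}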
@@ -291,14 +277,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
#ifdef SLOW_SHIFT_32
#define MIN_GET_BITS 15 /* minimum allowable value */
#else
-#define MIN_GET_BITS (BIT_BUF_SIZE-7)
+#define MIN_GET_BITS (BIT_BUF_SIZE - 7)
#endif
GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state *state,
- register bit_buf_type get_buffer, register int bits_left,
- int nbits)
+jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ int nbits)
/* Load up the bit buffer to a depth of at least nbits */
{
/* Copy heavily used state fields into locals (hopefully registers) */
@@ -316,13 +302,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
/* Attempt to read a byte */
if (bytes_in_buffer == 0) {
- if (! (*cinfo->src->fill_input_buffer) (cinfo))
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
return FALSE;
next_input_byte = cinfo->src->next_input_byte;
bytes_in_buffer = cinfo->src->bytes_in_buffer;
}
bytes_in_buffer--;
- c = GETJOCTET(*next_input_byte++);
+ c = *next_input_byte++;
/* If it's 0xFF, check and discard stuffed zero byte */
if (c == 0xFF) {
@@ -333,13 +319,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
*/
do {
if (bytes_in_buffer == 0) {
- if (! (*cinfo->src->fill_input_buffer) (cinfo))
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
return FALSE;
next_input_byte = cinfo->src->next_input_byte;
bytes_in_buffer = cinfo->src->bytes_in_buffer;
}
bytes_in_buffer--;
- c = GETJOCTET(*next_input_byte++);
+ c = *next_input_byte++;
} while (c == 0xFF);
if (c == 0) {
@@ -365,7 +351,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
bits_left += 8;
} /* end while */
} else {
- no_more_bytes:
+no_more_bytes:
/* We get here if we've read the marker that terminates the compressed
* data segment. There should be enough bits in the buffer register
* to satisfy the request; if so, no problem.
@@ -376,7 +362,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
* We use a nonvolatile flag to ensure that only one warning message
* appears per data segment.
*/
- if (! cinfo->entropy->insufficient_data) {
+ if (!cinfo->entropy->insufficient_data) {
WARNMS(cinfo, JWRN_HIT_MARKER);
cinfo->entropy->insufficient_data = TRUE;
}
@@ -400,11 +386,10 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
handle markers. We have to hand off any blocks with markers to the
slower routines. */
-#define GET_BYTE \
-{ \
+#define GET_BYTE { \
register int c0, c1; \
- c0 = GETJOCTET(*buffer++); \
- c1 = GETJOCTET(*buffer); \
+ c0 = *buffer++; \
+ c1 = *buffer; \
/* Pre-execute most common case */ \
get_buffer = (get_buffer << 8) | c0; \
bits_left += 8; \
@@ -421,7 +406,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
} \
}
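GET_BYTE (like the slow path above) enforces JPEG byte stuffing: inside entropy-coded data, a literal 0xFF is always followed by a stuffed 0x00, and 0xFF followed by anything else is a marker. A toy unstuffer showing just that rule; the real macro instead backs the pointer up and records cinfo->unread_marker so decoding can resume:

#include <stdio.h>

static int next_data_byte(const unsigned char **p)
{
  int c = *(*p)++;
  if (c == 0xFF && *(*p)++ != 0)
    return -1;                       /* hit a marker, e.g. 0xFFD9 (EOI) */
  return c;                          /* stuffed zero already skipped */
}

int main(void)
{
  const unsigned char buf[] = { 0x12, 0xFF, 0x00, 0xFF, 0xD9 };
  const unsigned char *p = buf;
  int c;
  while ((c = next_data_byte(&p)) >= 0)
    printf("0x%02X ", c);            /* prints 0x12 0xFF, then stops */
  printf("\n");
  return 0;
}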
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
/* Pre-fetch 48 bytes, because the holding register is 64-bit */
#define FILL_BIT_BUFFER_FAST \
@@ -446,9 +431,9 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
*/
GLOBAL(int)
-jpeg_huff_decode (bitread_working_state *state,
- register bit_buf_type get_buffer, register int bits_left,
- d_derived_tbl *htbl, int min_bits)
+jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ d_derived_tbl *htbl, int min_bits)
{
register int l = min_bits;
register JLONG code;
@@ -460,7 +445,7 @@ jpeg_huff_decode (bitread_working_state *state,
code = GET_BITS(l);
/* Collect the rest of the Huffman code one bit at a time. */
- /* This is per Figure F.16 in the JPEG spec. */
+ /* This is per Figure F.16. */
while (code > htbl->maxcode[l]) {
code <<= 1;
@@ -480,7 +465,7 @@ jpeg_huff_decode (bitread_working_state *state,
return 0; /* fake a zero as the safest result */
}
- return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ];
+ return htbl->pub->huffval[(int)(code + htbl->valoffset[l])];
}
@@ -492,22 +477,26 @@ jpeg_huff_decode (bitread_working_state *state,
#define AVOID_TABLES
#ifdef AVOID_TABLES
-#define NEG_1 ((unsigned int)-1)
-#define HUFF_EXTEND(x,s) ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1)))
+#define NEG_1 ((unsigned int)-1)
+#define HUFF_EXTEND(x, s) \
+ ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
#else
-#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-static const int extend_test[16] = /* entry n is 2**(n-1) */
- { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
- 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
- { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
- ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
- ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
- ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
#endif /* AVOID_TABLES */
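HUFF_EXTEND() implements the JPEG EXTEND procedure: an s-bit magnitude with its top bit clear encodes a negative value, x - (2^s - 1); otherwise x stands as-is. The branchless form builds an all-ones mask from a 32-bit arithmetic right shift, which the two worked cases below assume:

#include <stdio.h>

#define NEG_1 ((unsigned int)-1)
#define HUFF_EXTEND(x, s) \
  ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))

int main(void)
{
  /* s = 3 encodes magnitudes -7..-4 and 4..7. Received bits 0b010 have
   * the top bit clear, so the value is 2 - (2^3 - 1) = -5; 0b110 has
   * the top bit set and stands as-is. */
  printf("%d\n", (int)HUFF_EXTEND(2, 3));   /* -5 */
  printf("%d\n", (int)HUFF_EXTEND(6, 3));   /*  6 */
  return 0;
}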
@@ -518,9 +507,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
*/
LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci;
/* Throw away any unused bits remaining in bit buffer; */
@@ -529,7 +518,7 @@ process_restart (j_decompress_ptr cinfo)
entropy->bitstate.bits_left = 0;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
return FALSE;
/* Re-initialize DC predictions to 0 */
@@ -551,18 +540,24 @@ process_restart (j_decompress_ptr cinfo)
}
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
LOCAL(boolean)
-decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
BITREAD_STATE_VARS;
int blkn;
savable_state state;
/* Outer loop handles each block in the MCU */
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved;
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -583,11 +578,19 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->dc_needed[blkn]) {
/* Convert DC difference to actual value, update last_dc_val */
int ci = cinfo->MCU_membership[blkn];
+ /* Certain malformed JPEG images produce repeated DC coefficient
+ * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+ * grow until it overflows or underflows a 32-bit signed integer. This
+ * behavior is, to the best of our understanding, innocuous, and it is
+ * unclear how to work around it without potentially affecting
+ * performance. Thus, we (hopefully temporarily) suppress UBSan integer
+ * overflow errors for this function and decode_mcu_fast().
+ */
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
if (block) {
/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
- (*block)[0] = (JCOEF) s;
+ (*block)[0] = (JCOEF)s;
}
}
@@ -610,7 +613,7 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
* Note: the extra entries in jpeg_natural_order[] will save us
* if k >= DCTSIZE2, which could happen if the data is corrupted.
*/
- (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
} else {
if (r != 15)
break;
@@ -642,16 +645,22 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
return TRUE;
}
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
LOCAL(boolean)
-decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
BITREAD_STATE_VARS;
JOCTET *buffer;
int blkn;
@@ -659,9 +668,9 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Outer loop handles each block in the MCU */
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- buffer = (JOCTET *) br_state.next_input_byte;
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ buffer = (JOCTET *)br_state.next_input_byte;
+ state = entropy->saved;
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -669,7 +678,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
register int s, k, r, l;
- HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, dctbl);
if (s) {
FILL_BIT_BUFFER_FAST
r = GET_BITS(s);
@@ -678,16 +687,19 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->dc_needed[blkn]) {
int ci = cinfo->MCU_membership[blkn];
+      /* Refer to the comment in decode_mcu_slow() regarding the suppression of
+       * a UBSan integer overflow error in this line of code.
+ */
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
if (block)
- (*block)[0] = (JCOEF) s;
+ (*block)[0] = (JCOEF)s;
}
if (entropy->ac_needed[blkn] && block) {
for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
@@ -696,7 +708,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
FILL_BIT_BUFFER_FAST
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
- (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
} else {
if (r != 15) break;
k += 15;
@@ -706,7 +718,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
} else {
for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
@@ -723,15 +735,14 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
if (cinfo->unread_marker != 0) {
-slow_decode_mcu:
cinfo->unread_marker = 0;
return FALSE;
}
br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
br_state.next_input_byte = buffer;
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
return TRUE;
}
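The new guard attaches the attribute only when Clang's UBSan is active; __has_feature is a Clang extension, so the macro is probed before it is invoked. A standalone sketch of the same pattern (the function and values are invented; the attribute merely silences the sanitizer, it does not make the overflow defined):

#include <stdio.h>

#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("signed-integer-overflow")))
#endif
#endif
static int accumulate_dc(int last_dc, int diff)
{
  return last_dc + diff;   /* may wrap on malformed input, like last_dc_val */
}

int main(void)
{
  printf("%d\n", accumulate_dc(1000, 2047));
  return 0;
}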
@@ -751,43 +762,43 @@ slow_decode_mcu:
* this module, since we'll just re-assign them on the next call.)
*/
-#define BUFSIZE (DCTSIZE2 * 8)
+#define BUFSIZE (DCTSIZE2 * 8)
METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int usefast = 1;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
usefast = 0;
}
- if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU
- || cinfo->unread_marker != 0)
+ if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU ||
+ cinfo->unread_marker != 0)
usefast = 0;
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
if (usefast) {
if (!decode_mcu_fast(cinfo, MCU_data)) goto use_slow;
- }
- else {
- use_slow:
+ } else {
+use_slow:
if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
}
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -798,7 +809,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
GLOBAL(void)
-jinit_huff_decoder (j_decompress_ptr cinfo)
+jinit_huff_decoder(j_decompress_ptr cinfo)
{
huff_entropy_ptr entropy;
int i;
@@ -807,12 +818,12 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
are the default tables. Thus, if the tables are not set by the time
the Huffman decoder is initialized (usually within the body of
jpeg_start_decompress()), we set them to default values. */
- std_huff_tables((j_common_ptr) cinfo);
+ std_huff_tables((j_common_ptr)cinfo);
entropy = (huff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(huff_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass_huff_decoder;
entropy->pub.decode_mcu = decode_mcu;
diff --git a/media/libjpeg/jdhuff.h b/media/libjpeg/jdhuff.h
index 3f15d71a81..cfa0b7f558 100644
--- a/media/libjpeg/jdhuff.h
+++ b/media/libjpeg/jdhuff.h
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -43,13 +44,12 @@ typedef struct {
* if too long. The next 8 bits of each entry contain the
* symbol.
*/
- int lookup[1<<HUFF_LOOKAHEAD];
+ int lookup[1 << HUFF_LOOKAHEAD];
} d_derived_tbl;
/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
- (j_decompress_ptr cinfo, boolean isDC, int tblno,
- d_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
+ int tblno, d_derived_tbl **pdtbl);
/*
@@ -74,11 +74,16 @@ EXTERN(void) jpeg_make_d_derived_tbl
#error Cannot determine word size
#endif
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
typedef size_t bit_buf_type; /* type of bit-extraction buffer */
#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+
#else
typedef unsigned long bit_buf_type; /* type of bit-extraction buffer */
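
The new __x86_64__ && __ILP32__ branch covers the Linux x32 ABI: size_t is 32 bits there, so the first test fails, yet the CPU still has 64-bit registers and a 64-bit bit buffer remains the fast choice. A compilable sketch of the selection; the UINTPTR_MAX probe below stands in for the configure-generated SIZEOF_SIZE_T and is an assumption, not the patch's mechanism:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    #if UINTPTR_MAX > 0xFFFFFFFFu || defined(_WIN64)
    typedef size_t bit_buf_type;              /* native 64-bit word */
    #elif defined(__x86_64__) && defined(__ILP32__)
    typedef unsigned long long bit_buf_type;  /* x32: 32-bit size_t, 64-bit regs */
    #else
    typedef unsigned long bit_buf_type;       /* conservative fallback */
    #endif

    int main(void)
    {
      printf("bit buffer: %zu bits\n", sizeof(bit_buf_type) * CHAR_BIT);
      return 0;
    }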
@@ -113,23 +118,23 @@ typedef struct { /* Bitreading working state within an MCU */
} bitread_working_state;
/* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS \
- register bit_buf_type get_buffer; \
- register int bits_left; \
- bitread_working_state br_state
-
-#define BITREAD_LOAD_STATE(cinfop,permstate) \
- br_state.cinfo = cinfop; \
- br_state.next_input_byte = cinfop->src->next_input_byte; \
- br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
- get_buffer = permstate.get_buffer; \
- bits_left = permstate.bits_left;
-
-#define BITREAD_SAVE_STATE(cinfop,permstate) \
- cinfop->src->next_input_byte = br_state.next_input_byte; \
- cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
- permstate.get_buffer = get_buffer; \
- permstate.bits_left = bits_left
+#define BITREAD_STATE_VARS \
+ register bit_buf_type get_buffer; \
+ register int bits_left; \
+ bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop, permstate) \
+ br_state.cinfo = cinfop; \
+ br_state.next_input_byte = cinfop->src->next_input_byte; \
+ br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+ get_buffer = permstate.get_buffer; \
+ bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop, permstate) \
+ cinfop->src->next_input_byte = br_state.next_input_byte; \
+ cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+ permstate.get_buffer = get_buffer; \
+ permstate.bits_left = bits_left
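
The point of the LOAD/SAVE pair: the hot decode loop keeps get_buffer and bits_left in registers and writes them back to the permanent per-decoder state only at suspension points. A standalone miniature of the same pattern (types, names, and values are illustrative, not libjpeg's):

    #include <stdio.h>

    typedef struct {                  /* stand-in for the permanent state */
      unsigned long long get_buffer;
      int bits_left;
    } perm_state;

    static int get_bits(unsigned long long *buf, int *left, int nbits)
    {
      *left -= nbits;
      return (int)(*buf >> *left) & ((1 << nbits) - 1);
    }

    int main(void)
    {
      perm_state perm = { 0xCAFEull << 16, 32 };        /* 32 buffered bits */
      unsigned long long get_buffer = perm.get_buffer;  /* "LOAD_STATE" */
      int bits_left = perm.bits_left;

      printf("0x%02X ", get_bits(&get_buffer, &bits_left, 8));   /* 0xCA */
      printf("0x%02X\n", get_bits(&get_buffer, &bits_left, 8));  /* 0xFE */

      perm.get_buffer = get_buffer;                     /* "SAVE_STATE" */
      perm.bits_left = bits_left;
      return 0;
    }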
/*
* These macros provide the in-line portion of bit fetching.
@@ -137,7 +142,7 @@ typedef struct { /* Bitreading working state within an MCU */
* before using GET_BITS, PEEK_BITS, or DROP_BITS.
* The variables get_buffer and bits_left are assumed to be locals,
* but the state struct might not be (jpeg_huff_decode needs this).
- * CHECK_BIT_BUFFER(state,n,action);
+ * CHECK_BIT_BUFFER(state, n, action);
* Ensure there are N bits in get_buffer; if suspend, take action.
* val = GET_BITS(n);
* Fetch next N bits.
@@ -149,25 +154,27 @@ typedef struct { /* Bitreading working state within an MCU */
* is evaluated multiple times.
*/
-#define CHECK_BIT_BUFFER(state,nbits,action) \
- { if (bits_left < (nbits)) { \
- if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \
- { action; } \
- get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+#define CHECK_BIT_BUFFER(state, nbits, action) { \
+ if (bits_left < (nbits)) { \
+ if (!jpeg_fill_bit_buffer(&(state), get_buffer, bits_left, nbits)) \
+ { action; } \
+ get_buffer = (state).get_buffer; bits_left = (state).bits_left; \
+ } \
+}
#define GET_BITS(nbits) \
- (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+ (((int)(get_buffer >> (bits_left -= (nbits)))) & ((1 << (nbits)) - 1))
#define PEEK_BITS(nbits) \
- (((int) (get_buffer >> (bits_left - (nbits)))) & ((1<<(nbits))-1))
+ (((int)(get_buffer >> (bits_left - (nbits)))) & ((1 << (nbits)) - 1))
#define DROP_BITS(nbits) \
- (bits_left -= (nbits))
+ (bits_left -= (nbits))
/* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, int nbits);
+EXTERN(boolean) jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, int nbits);
/*
@@ -187,13 +194,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
* 3. jpeg_huff_decode returns -1 if forced to suspend.
*/
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
+#define HUFF_DECODE(result, state, htbl, failaction, slowlabel) { \
+ register int nb, look; \
if (bits_left < HUFF_LOOKAHEAD) { \
- if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if (!jpeg_fill_bit_buffer(&state, get_buffer, bits_left, 0)) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
if (bits_left < HUFF_LOOKAHEAD) { \
- nb = 1; goto slowlabel; \
+ nb = 1; goto slowlabel; \
} \
} \
look = PEEK_BITS(HUFF_LOOKAHEAD); \
@@ -202,13 +210,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
} else { \
slowlabel: \
- if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
- { failaction; } \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if ((result = \
+ jpeg_huff_decode(&state, get_buffer, bits_left, htbl, nb)) < 0) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
} \
}
-#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
+#define HUFF_DECODE_FAST(s, nb, htbl) \
FILL_BIT_BUFFER_FAST; \
s = PEEK_BITS(HUFF_LOOKAHEAD); \
s = htbl->lookup[s]; \
@@ -226,11 +235,13 @@ slowlabel: \
nb++; \
} \
if (nb > 16) \
- goto slowlabel; \
- s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
+ s = 0; \
+ else \
+ s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
}
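
The HUFF_DECODE_FAST change is defensive: the old macro jumped to a caller-supplied slow label when a code ran past 16 bits, while the new one clamps the symbol to 0 and masks the huffval[] index, so a corrupted bitstream can no longer index beyond the 256-entry table. A toy standalone version of the clamping (made-up table and values; the valoffset bias is omitted):

    #include <stdio.h>

    int main(void)
    {
      unsigned char huffval[256] = { 42 };   /* made-up symbol table */
      int nb = 17;        /* pretend the code length ran past 16 bits */
      int s = 0x7FFF;     /* pretend accumulated code value */

      if (nb > 16)
        s = 0;                      /* corrupt stream: clamp, don't goto */
      else
        s = huffval[s & 0xFF];      /* mask keeps the index in bounds */

      printf("decoded symbol: %d\n", s);     /* 0 */
      return 0;
    }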
/* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, d_derived_tbl *htbl, int min_bits);
+EXTERN(int) jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, d_derived_tbl *htbl,
+ int min_bits);
diff --git a/media/libjpeg/jdicc.c b/media/libjpeg/jdicc.c
new file mode 100644
index 0000000000..50aa9a9676
--- /dev/null
+++ b/media/libjpeg/jdicc.c
@@ -0,0 +1,167 @@
+/*
+ * jdicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to read International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to get the profile data from a JPEG file while reading it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
+
+
+/*
+ * Handy subroutine to test whether a saved marker is an ICC profile marker.
+ */
+
+LOCAL(boolean)
+marker_is_icc(jpeg_saved_marker_ptr marker)
+{
+ return
+ marker->marker == ICC_MARKER &&
+ marker->data_length >= ICC_OVERHEAD_LEN &&
+ /* verify the identifying string */
+ marker->data[0] == 0x49 &&
+ marker->data[1] == 0x43 &&
+ marker->data[2] == 0x43 &&
+ marker->data[3] == 0x5F &&
+ marker->data[4] == 0x50 &&
+ marker->data[5] == 0x52 &&
+ marker->data[6] == 0x4F &&
+ marker->data[7] == 0x46 &&
+ marker->data[8] == 0x49 &&
+ marker->data[9] == 0x4C &&
+ marker->data[10] == 0x45 &&
+ marker->data[11] == 0x0;
+}
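+
+/* The byte-by-byte test above spells out "ICC_PROFILE" plus its NUL
+ * terminator.  An equivalent memcmp() formulation, as a standalone sketch
+ * with a simplified stand-in for the marker type (the real code keeps the
+ * explicit bytes):
+ *
+ *   #include <stdio.h>
+ *   #include <string.h>
+ *
+ *   struct sketch_marker {
+ *     int marker;
+ *     unsigned int data_length;
+ *     unsigned char data[64];
+ *   };
+ *
+ *   static int marker_is_icc_sketch(const struct sketch_marker *m)
+ *   {
+ *     return m->marker == 0xE2 &&           /+ APP2 = JPEG_APP0 + 2 +/
+ *            m->data_length >= 14 &&        /+ ICC_OVERHEAD_LEN +/
+ *            memcmp(m->data, "ICC_PROFILE", 12) == 0;  /+ 11 chars + NUL +/
+ *   }
+ *
+ *   int main(void)
+ *   {
+ *     struct sketch_marker m = { 0xE2, 14, "ICC_PROFILE" };
+ *     printf("is ICC: %d\n", marker_is_icc_sketch(&m));   /+ 1 +/
+ *     return 0;
+ *   }
+ */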
+
+
+/*
+ * See if there was an ICC profile in the JPEG file being read; if so,
+ * reassemble and return the profile data.
+ *
+ * TRUE is returned if an ICC profile was found, FALSE if not. If TRUE is
+ * returned, *icc_data_ptr is set to point to the returned data, and
+ * *icc_data_len is set to its length.
+ *
+ * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be
+ * freed by the caller with free() when the caller no longer needs it.
+ * (Alternatively, we could write this routine to use the IJG library's memory
+ * allocator, so that the data would be freed implicitly when
+ * jpeg_finish_decompress() is called. But it seems likely that many
+ * applications will prefer to have the data stick around after decompression
+ * finishes.)
+ */
+
+GLOBAL(boolean)
+jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+ unsigned int *icc_data_len)
+{
+ jpeg_saved_marker_ptr marker;
+ int num_markers = 0;
+ int seq_no;
+ JOCTET *icc_data;
+ unsigned int total_length;
+#define MAX_SEQ_NO 255 /* sufficient since marker numbers are bytes */
+ char marker_present[MAX_SEQ_NO + 1]; /* 1 if marker found */
+ unsigned int data_length[MAX_SEQ_NO + 1]; /* size of profile data in marker */
+ unsigned int data_offset[MAX_SEQ_NO + 1]; /* offset for data in marker */
+
+ if (icc_data_ptr == NULL || icc_data_len == NULL)
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ if (cinfo->global_state < DSTATE_READY)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ *icc_data_ptr = NULL; /* avoid confusion if FALSE return */
+ *icc_data_len = 0;
+
+ /* This first pass over the saved markers discovers whether there are
+ * any ICC markers and verifies the consistency of the marker numbering.
+ */
+
+ for (seq_no = 1; seq_no <= MAX_SEQ_NO; seq_no++)
+ marker_present[seq_no] = 0;
+
+ for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+ if (marker_is_icc(marker)) {
+ if (num_markers == 0)
+ num_markers = marker->data[13];
+ else if (num_markers != marker->data[13]) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* inconsistent num_markers fields */
+ return FALSE;
+ }
+ seq_no = marker->data[12];
+ if (seq_no <= 0 || seq_no > num_markers) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* bogus sequence number */
+ return FALSE;
+ }
+ if (marker_present[seq_no]) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* duplicate sequence numbers */
+ return FALSE;
+ }
+ marker_present[seq_no] = 1;
+ data_length[seq_no] = marker->data_length - ICC_OVERHEAD_LEN;
+ }
+ }
+
+ if (num_markers == 0)
+ return FALSE;
+
+ /* Check for missing markers, count total space needed,
+ * compute offset of each marker's part of the data.
+ */
+
+ total_length = 0;
+ for (seq_no = 1; seq_no <= num_markers; seq_no++) {
+ if (marker_present[seq_no] == 0) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* missing sequence number */
+ return FALSE;
+ }
+ data_offset[seq_no] = total_length;
+ total_length += data_length[seq_no];
+ }
+
+ if (total_length == 0) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* found only empty markers? */
+ return FALSE;
+ }
+
+ /* Allocate space for assembled data */
+ icc_data = (JOCTET *)malloc(total_length * sizeof(JOCTET));
+ if (icc_data == NULL)
+ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11); /* oops, out of memory */
+
+ /* and fill it in */
+ for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+ if (marker_is_icc(marker)) {
+ JOCTET FAR *src_ptr;
+ JOCTET *dst_ptr;
+ unsigned int length;
+ seq_no = marker->data[12];
+ dst_ptr = icc_data + data_offset[seq_no];
+ src_ptr = marker->data + ICC_OVERHEAD_LEN;
+ length = data_length[seq_no];
+ while (length--) {
+ *dst_ptr++ = *src_ptr++;
+ }
+ }
+ }
+
+ *icc_data_ptr = icc_data;
+ *icc_data_len = total_length;
+
+ return TRUE;
+}
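
A minimal usage sketch for the new entry point (hypothetical helper name; error handling elided). The one non-obvious requirement is that APP2 markers must be saved during header parsing, otherwise the marker list this routine walks is empty:

    #include <stdio.h>
    #include <stdlib.h>
    #include <jpeglib.h>

    void dump_icc_profile(FILE *infile)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;
      JOCTET *icc_data;
      unsigned int icc_len;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_stdio_src(&cinfo, infile);

      /* retain APP2 markers; without this, jpeg_read_icc_profile() fails */
      jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);

      (void)jpeg_read_header(&cinfo, TRUE);

      if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
        printf("ICC profile: %u bytes\n", icc_len);
        free(icc_data);          /* profile is malloc()ed; caller frees */
      }
      jpeg_destroy_decompress(&cinfo);
    }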
diff --git a/media/libjpeg/jdinput.c b/media/libjpeg/jdinput.c
index 32a6b424e2..1bc5aff1a7 100644
--- a/media/libjpeg/jdinput.c
+++ b/media/libjpeg/jdinput.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -33,7 +33,7 @@ typedef my_input_controller *my_inputctl_ptr;
/* Forward declarations */
-METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
+METHODDEF(int) consume_markers(j_decompress_ptr cinfo);
/*
@@ -41,16 +41,16 @@ METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
*/
LOCAL(void)
-initial_setup (j_decompress_ptr cinfo)
+initial_setup(j_decompress_ptr cinfo)
/* Called once, when first SOS marker is reached */
{
int ci;
jpeg_component_info *compptr;
/* Make sure image isn't bigger than I can handle */
- if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
- (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+ if ((long)cinfo->image_height > (long)JPEG_MAX_DIMENSION ||
+ (long)cinfo->image_width > (long)JPEG_MAX_DIMENSION)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
/* For now, precision must match compiled-in value... */
if (cinfo->data_precision != BITS_IN_JSAMPLE)
@@ -66,8 +66,10 @@ initial_setup (j_decompress_ptr cinfo)
cinfo->max_v_samp_factor = 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
- compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+ if (compptr->h_samp_factor <= 0 ||
+ compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+ compptr->v_samp_factor <= 0 ||
+ compptr->v_samp_factor > MAX_SAMP_FACTOR)
ERREXIT(cinfo, JERR_BAD_SAMPLING);
cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
compptr->h_samp_factor);
@@ -75,10 +77,10 @@ initial_setup (j_decompress_ptr cinfo)
compptr->v_samp_factor);
}
-#if JPEG_LIB_VERSION >=80
- cinfo->block_size = DCTSIZE;
- cinfo->natural_order = jpeg_natural_order;
- cinfo->lim_Se = DCTSIZE2-1;
+#if JPEG_LIB_VERSION >= 80
+ cinfo->block_size = DCTSIZE;
+ cinfo->natural_order = jpeg_natural_order;
+ cinfo->lim_Se = DCTSIZE2 - 1;
#endif
/* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
@@ -101,11 +103,11 @@ initial_setup (j_decompress_ptr cinfo)
#endif
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->height_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
/* Set the first and last MCU columns to decompress from multi-scan images.
* By default, decompress all of the MCU columns.
*/
@@ -117,11 +119,11 @@ initial_setup (j_decompress_ptr cinfo)
*/
/* Size in samples */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
- (long) cinfo->max_h_samp_factor);
+ jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+ (long)cinfo->max_h_samp_factor);
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
- (long) cinfo->max_v_samp_factor);
+ jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+ (long)cinfo->max_v_samp_factor);
/* Mark component needed, until color conversion says otherwise */
compptr->component_needed = TRUE;
/* Mark no quantization table yet saved for component */
@@ -130,8 +132,8 @@ initial_setup (j_decompress_ptr cinfo)
/* Compute number of fully interleaved MCU rows. */
cinfo->total_iMCU_rows = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
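
For reference, jdiv_round_up() in jutils.c is ceiling division, so the computation above rounds partial iMCU rows up. A worked example with common 4:2:0 parameters:

    #include <stdio.h>

    static long jdiv_round_up(long a, long b) { return (a + b - 1) / b; }

    int main(void)
    {
      long image_height = 600;
      /* max_v_samp_factor = 2, DCTSIZE = 8: each iMCU row spans 16 pixel
       * rows, so a 600-row image needs ceil(600 / 16) = 38 iMCU rows; the
       * 38th is only partially full and gets padded. */
      printf("total_iMCU_rows = %ld\n", jdiv_round_up(image_height, 2L * 8L));
      return 0;
    }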
/* Decide whether file contains multiple scans */
if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -142,7 +144,7 @@ initial_setup (j_decompress_ptr cinfo)
LOCAL(void)
-per_scan_setup (j_decompress_ptr cinfo)
+per_scan_setup(j_decompress_ptr cinfo)
/* Do computations that are needed before processing a JPEG scan */
/* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */
{
@@ -167,7 +169,7 @@ per_scan_setup (j_decompress_ptr cinfo)
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
*/
- tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (tmp == 0) tmp = compptr->v_samp_factor;
compptr->last_row_height = tmp;
@@ -184,11 +186,11 @@ per_scan_setup (j_decompress_ptr cinfo)
/* Overall image size in MCUs */
cinfo->MCUs_per_row = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width,
- (long) (cinfo->max_h_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
cinfo->MCU_rows_in_scan = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
cinfo->blocks_in_MCU = 0;
@@ -198,12 +200,13 @@ per_scan_setup (j_decompress_ptr cinfo)
compptr->MCU_width = compptr->h_samp_factor;
compptr->MCU_height = compptr->v_samp_factor;
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
- compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
+ compptr->MCU_sample_width = compptr->MCU_width *
+ compptr->_DCT_scaled_size;
/* Figure number of non-dummy blocks in last MCU column & row */
- tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+ tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
compptr->last_col_width = tmp;
- tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+ tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
if (tmp == 0) tmp = compptr->MCU_height;
compptr->last_row_height = tmp;
/* Prepare array describing MCU composition */
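
The modulo-then-substitute idiom above computes how many real (non-dummy) blocks land in the last MCU column and row. A standalone worked example:

    #include <stdio.h>

    int main(void)
    {
      int MCU_width = 2;                 /* blocks per MCU, horizontally */
      int widths[] = { 38, 39 };         /* component widths in blocks */

      for (int i = 0; i < 2; i++) {
        int tmp = widths[i] % MCU_width;
        if (tmp == 0) tmp = MCU_width;   /* exact fit: last column is full */
        printf("width %d blocks -> last_col_width %d (dummy blocks: %d)\n",
               widths[i], tmp, MCU_width - tmp);
      }
      return 0;
    }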
@@ -231,17 +234,17 @@ per_scan_setup (j_decompress_ptr cinfo)
* means that we have to save away the table actually used for each component.
* We do this by copying the table at the start of the first scan containing
* the component.
- * The JPEG spec prohibits the encoder from changing the contents of a Q-table
- * slot between scans of a component using that slot. If the encoder does so
- * anyway, this decoder will simply use the Q-table values that were current
- * at the start of the first scan for the component.
+ * Rec. ITU-T T.81 | ISO/IEC 10918-1 prohibits the encoder from changing the
+ * contents of a Q-table slot between scans of a component using that slot. If
+ * the encoder does so anyway, this decoder will simply use the Q-table values
+ * that were current at the start of the first scan for the component.
*
* The decompressor output side looks only at the saved quant tables,
* not at the current Q-table slots.
*/
LOCAL(void)
-latch_quant_tables (j_decompress_ptr cinfo)
+latch_quant_tables(j_decompress_ptr cinfo)
{
int ci, qtblno;
jpeg_component_info *compptr;
@@ -259,9 +262,9 @@ latch_quant_tables (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
/* OK, save away the quantization table */
qtbl = (JQUANT_TBL *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JQUANT_TBL));
- MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
+ memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
compptr->quant_table = qtbl;
}
}
@@ -275,7 +278,7 @@ latch_quant_tables (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
{
per_scan_setup(cinfo);
latch_quant_tables(cinfo);
@@ -292,7 +295,7 @@ start_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_input_pass (j_decompress_ptr cinfo)
+finish_input_pass(j_decompress_ptr cinfo)
{
cinfo->inputctl->consume_input = consume_markers;
}
@@ -309,9 +312,9 @@ finish_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-consume_markers (j_decompress_ptr cinfo)
+consume_markers(j_decompress_ptr cinfo)
{
- my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+ my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
int val;
if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
@@ -329,7 +332,7 @@ consume_markers (j_decompress_ptr cinfo)
* responsible for enforcing this sequencing.
*/
} else { /* 2nd or later SOS marker */
- if (! inputctl->pub.has_multiple_scans)
+ if (!inputctl->pub.has_multiple_scans)
ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
start_input_pass(cinfo);
}
@@ -360,16 +363,16 @@ consume_markers (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-reset_input_controller (j_decompress_ptr cinfo)
+reset_input_controller(j_decompress_ptr cinfo)
{
- my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+ my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
inputctl->pub.consume_input = consume_markers;
inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
inputctl->pub.eoi_reached = FALSE;
inputctl->inheaders = TRUE;
/* Reset other modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->marker->reset_marker_reader) (cinfo);
/* Reset progression state -- would be cleaner if entropy decoder did this */
cinfo->coef_bits = NULL;
@@ -382,15 +385,15 @@ reset_input_controller (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_input_controller (j_decompress_ptr cinfo)
+jinit_input_controller(j_decompress_ptr cinfo)
{
my_inputctl_ptr inputctl;
/* Create subobject in permanent pool */
inputctl = (my_inputctl_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_input_controller));
- cinfo->inputctl = (struct jpeg_input_controller *) inputctl;
+ cinfo->inputctl = (struct jpeg_input_controller *)inputctl;
/* Initialize method pointers */
inputctl->pub.consume_input = consume_markers;
inputctl->pub.reset_input_controller = reset_input_controller;
diff --git a/media/libjpeg/jdmainct.c b/media/libjpeg/jdmainct.c
index ebb069b0f4..f466b259f0 100644
--- a/media/libjpeg/jdmainct.c
+++ b/media/libjpeg/jdmainct.c
@@ -18,6 +18,7 @@
#include "jinclude.h"
#include "jdmainct.h"
+#include "jconfigint.h"
/*
@@ -112,26 +113,29 @@
/* Forward declarations */
-METHODDEF(void) process_data_simple_main
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
-METHODDEF(void) process_data_context_main
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) process_data_crank_post
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#endif
LOCAL(void)
-alloc_funny_pointers (j_decompress_ptr cinfo)
+alloc_funny_pointers(j_decompress_ptr cinfo)
/* Allocate space for the funny pointer lists.
* This is done only once, not once per pass.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -141,7 +145,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
* We alloc both arrays with one call to save a few cycles.
*/
main_ptr->xbuffer[0] = (JSAMPIMAGE)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->num_components * 2 * sizeof(JSAMPARRAY));
main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
@@ -153,7 +157,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
* We alloc both pointer lists with one call to save a few cycles.
*/
xbuf = (JSAMPARRAY)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
xbuf += rgroup; /* want one row group at negative offsets */
main_ptr->xbuffer[0][ci] = xbuf;
@@ -164,7 +168,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
LOCAL(void)
-make_funny_pointers (j_decompress_ptr cinfo)
+make_funny_pointers(j_decompress_ptr cinfo)
/* Create the funny pointer lists discussed in the comments above.
* The actual workspace is already allocated (in main_ptr->buffer),
* and the space for the pointer lists is allocated too.
@@ -172,7 +176,7 @@ make_funny_pointers (j_decompress_ptr cinfo)
* This will be repeated at the beginning of each pass.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -191,8 +195,8 @@ make_funny_pointers (j_decompress_ptr cinfo)
}
/* In the second list, put the last four row groups in swapped order */
for (i = 0; i < rgroup * 2; i++) {
- xbuf1[rgroup*(M-2) + i] = buf[rgroup*M + i];
- xbuf1[rgroup*M + i] = buf[rgroup*(M-2) + i];
+ xbuf1[rgroup * (M - 2) + i] = buf[rgroup * M + i];
+ xbuf1[rgroup * M + i] = buf[rgroup * (M - 2) + i];
}
/* The wraparound pointers at top and bottom will be filled later
* (see set_wraparound_pointers, below). Initially we want the "above"
@@ -207,13 +211,13 @@ make_funny_pointers (j_decompress_ptr cinfo)
LOCAL(void)
-set_bottom_pointers (j_decompress_ptr cinfo)
+set_bottom_pointers(j_decompress_ptr cinfo)
/* Change the pointer lists to duplicate the last sample row at the bottom
* of the image. whichptr indicates which xbuffer holds the final iMCU row.
* Also sets rowgroups_avail to indicate number of nondummy row groups in row.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup, iMCUheight, rows_left;
jpeg_component_info *compptr;
JSAMPARRAY xbuf;
@@ -224,20 +228,20 @@ set_bottom_pointers (j_decompress_ptr cinfo)
iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
/* Count nondummy sample rows remaining for this component */
- rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
+ rows_left = (int)(compptr->downsampled_height % (JDIMENSION)iMCUheight);
if (rows_left == 0) rows_left = iMCUheight;
/* Count nondummy row groups. Should get same answer for each component,
* so we need only do it once.
*/
if (ci == 0) {
- main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)((rows_left - 1) / rgroup + 1);
}
/* Duplicate the last real sample row rgroup*2 times; this pads out the
* last partial rowgroup and ensures at least one full rowgroup of context.
*/
xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
for (i = 0; i < rgroup * 2; i++) {
- xbuf[rows_left + i] = xbuf[rows_left-1];
+ xbuf[rows_left + i] = xbuf[rows_left - 1];
}
}
}
@@ -248,9 +252,9 @@ set_bottom_pointers (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
switch (pass_mode) {
case JBUF_PASS_THRU:
@@ -286,22 +290,21 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-process_data_simple_main (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
JDIMENSION rowgroups_avail;
/* Read input data if we haven't filled the main buffer yet */
- if (! main_ptr->buffer_full) {
- if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+ if (!main_ptr->buffer_full) {
+ if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
return; /* suspension forced, can do nothing more */
main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */
}
/* There are always min_DCT_scaled_size row groups in an iMCU row. */
- rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
+ rowgroups_avail = (JDIMENSION)cinfo->_min_DCT_scaled_size;
/* Note: at the bottom of the image, we may pass extra garbage row groups
* to the postprocessor. The postprocessor has to check for bottom
* of image anyway (at row resolution), so no point in us doing it too.
@@ -326,16 +329,15 @@ process_data_simple_main (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-process_data_context_main (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
/* Read input data if we haven't filled the main buffer yet */
- if (! main_ptr->buffer_full) {
- if (! (*cinfo->coef->decompress_data) (cinfo,
- main_ptr->xbuffer[main_ptr->whichptr]))
+ if (!main_ptr->buffer_full) {
+ if (!(*cinfo->coef->decompress_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr]))
return; /* suspension forced, can do nothing more */
main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */
main_ptr->iMCU_row_ctr++; /* count rows received */
@@ -349,31 +351,35 @@ process_data_context_main (j_decompress_ptr cinfo,
switch (main_ptr->context_state) {
case CTX_POSTPONED_ROW:
/* Call postprocessor using previously set pointers for postponed row */
- (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
- &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
- output_buf, out_row_ctr, out_rows_avail);
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
return; /* Need to suspend */
main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
if (*out_row_ctr >= out_rows_avail)
return; /* Postprocessor exactly filled output buf */
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case CTX_PREPARE_FOR_IMCU:
/* Prepare to process first M-1 row groups of this iMCU row */
main_ptr->rowgroup_ctr = 0;
- main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size - 1);
/* Check for bottom of image: if so, tweak pointers to "duplicate"
* the last sample row, and adjust rowgroups_avail to ignore padding rows.
*/
if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
set_bottom_pointers(cinfo);
main_ptr->context_state = CTX_PROCESS_IMCU;
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case CTX_PROCESS_IMCU:
/* Call postprocessor using previously set pointers */
- (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
- &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
- output_buf, out_row_ctr, out_rows_avail);
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
return; /* Need to suspend */
/* After the first iMCU, change wraparound pointers to normal state */
@@ -384,8 +390,8 @@ process_data_context_main (j_decompress_ptr cinfo,
main_ptr->buffer_full = FALSE;
/* Still need to process last row group of this iMCU row, */
/* which is saved at index M+1 of the other xbuffer */
- main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
- main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
+ main_ptr->rowgroup_ctr = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 2);
main_ptr->context_state = CTX_POSTPONED_ROW;
}
}
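
FALLTHROUGH is the annotation newly pulled in via the jconfigint.h include at the top of this file; it marks the two deliberate case fallthroughs in the state machine so -Wimplicit-fallthrough stays quiet, while the old lint-style comment is retained for tools that key on it. The real definition is configure-generated; the following is an assumed approximation, with a tiny usage demo:

    #if defined(__has_attribute)
    #if __has_attribute(fallthrough)
    #define FALLTHROUGH __attribute__((fallthrough));
    #endif
    #endif
    #ifndef FALLTHROUGH
    #define FALLTHROUGH
    #endif

    #include <stdio.h>

    int main(void)
    {
      int state = 0, steps = 0;
      switch (state) {
      case 0:
        steps++;
        FALLTHROUGH                  /* deliberate, and now warning-clean */
      case 1:
        steps++;
      }
      printf("steps: %d\n", steps);  /* 2 */
      return 0;
    }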
@@ -400,12 +406,11 @@ process_data_context_main (j_decompress_ptr cinfo,
#ifdef QUANT_2PASS_SUPPORTED
METHODDEF(void)
-process_data_crank_post (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
- (JDIMENSION *) NULL, (JDIMENSION) 0,
+ (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL,
+ (JDIMENSION *)NULL, (JDIMENSION)0,
output_buf, out_row_ctr, out_rows_avail);
}
@@ -417,16 +422,16 @@ process_data_crank_post (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_main_ptr main_ptr;
int ci, rgroup, ngroups;
jpeg_component_info *compptr;
main_ptr = (my_main_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_main_controller));
- cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
+ cinfo->main = (struct jpeg_d_main_controller *)main_ptr;
main_ptr->pub.start_pass = start_pass_main;
if (need_full_buffer) /* shouldn't happen */
@@ -449,8 +454,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
cinfo->_min_DCT_scaled_size; /* height of a row group of component */
main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
compptr->width_in_blocks * compptr->_DCT_scaled_size,
- (JDIMENSION) (rgroup * ngroups));
+ (JDIMENSION)(rgroup * ngroups));
}
}
diff --git a/media/libjpeg/jdmainct.h b/media/libjpeg/jdmainct.h
index 30903019ca..37b201ca88 100644
--- a/media/libjpeg/jdmainct.h
+++ b/media/libjpeg/jdmainct.h
@@ -44,12 +44,12 @@ typedef my_main_controller *my_main_ptr;
LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
+set_wraparound_pointers(j_decompress_ptr cinfo)
/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
* This changes the pointer list state from top-of-image to the normal state.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -62,10 +62,10 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
xbuf0 = main_ptr->xbuffer[0][ci];
xbuf1 = main_ptr->xbuffer[1][ci];
for (i = 0; i < rgroup; i++) {
- xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
- xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
- xbuf0[rgroup*(M+2) + i] = xbuf0[i];
- xbuf1[rgroup*(M+2) + i] = xbuf1[i];
+ xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
+ xbuf1[i - rgroup] = xbuf1[rgroup * (M + 1) + i];
+ xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
+ xbuf1[rgroup * (M + 2) + i] = xbuf1[i];
}
}
}
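
What the aliasing above buys: the slots one row group before the start and past the end of each pointer list point back at rows inside the buffer, so the upsampler always sees context rows at the top and bottom of an iMCU row without copying any samples. A standalone miniature with tiny illustrative sizes (ints stand in for row pointers):

    #include <stdio.h>

    int main(void)
    {
      enum { M = 2, rgroup = 1 };        /* illustrative sizes */
      int slots[2 + rgroup * (M + 3)];   /* backing store for the list */
      int *xbuf0 = slots + rgroup;       /* one row group at negative offsets */
      int i;

      for (i = 0; i < rgroup * (M + 2); i++)
        xbuf0[i] = i;                    /* stand-ins for real row pointers */

      for (i = 0; i < rgroup; i++) {     /* the wraparound aliasing above */
        xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
        xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
      }
      printf("xbuf0[-1] -> row %d, xbuf0[%d] -> row %d\n",
             xbuf0[-1], rgroup * (M + 2), xbuf0[rgroup * (M + 2)]);
      return 0;
    }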
diff --git a/media/libjpeg/jdmarker.c b/media/libjpeg/jdmarker.c
index e3b612c9b9..f7eba615fd 100644
--- a/media/libjpeg/jdmarker.c
+++ b/media/libjpeg/jdmarker.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2012, 2015, D. R. Commander.
+ * Copyright (C) 2012, 2015, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -119,50 +119,50 @@ typedef my_marker_reader *my_marker_ptr;
*/
/* Declare and initialize local copies of input pointer/count */
-#define INPUT_VARS(cinfo) \
- struct jpeg_source_mgr *datasrc = (cinfo)->src; \
- const JOCTET *next_input_byte = datasrc->next_input_byte; \
- size_t bytes_in_buffer = datasrc->bytes_in_buffer
+#define INPUT_VARS(cinfo) \
+ struct jpeg_source_mgr *datasrc = (cinfo)->src; \
+ const JOCTET *next_input_byte = datasrc->next_input_byte; \
+ size_t bytes_in_buffer = datasrc->bytes_in_buffer
/* Unload the local copies --- do this only at a restart boundary */
-#define INPUT_SYNC(cinfo) \
- ( datasrc->next_input_byte = next_input_byte, \
- datasrc->bytes_in_buffer = bytes_in_buffer )
+#define INPUT_SYNC(cinfo) \
+ ( datasrc->next_input_byte = next_input_byte, \
+ datasrc->bytes_in_buffer = bytes_in_buffer )
/* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
-#define INPUT_RELOAD(cinfo) \
- ( next_input_byte = datasrc->next_input_byte, \
- bytes_in_buffer = datasrc->bytes_in_buffer )
+#define INPUT_RELOAD(cinfo) \
+ ( next_input_byte = datasrc->next_input_byte, \
+ bytes_in_buffer = datasrc->bytes_in_buffer )
/* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
* Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
* but we must reload the local copies after a successful fill.
*/
-#define MAKE_BYTE_AVAIL(cinfo,action) \
- if (bytes_in_buffer == 0) { \
- if (! (*datasrc->fill_input_buffer) (cinfo)) \
- { action; } \
- INPUT_RELOAD(cinfo); \
- }
+#define MAKE_BYTE_AVAIL(cinfo, action) \
+ if (bytes_in_buffer == 0) { \
+ if (!(*datasrc->fill_input_buffer) (cinfo)) \
+ { action; } \
+ INPUT_RELOAD(cinfo); \
+ }
/* Read a byte into variable V.
* If must suspend, take the specified action (typically "return FALSE").
*/
-#define INPUT_BYTE(cinfo,V,action) \
- MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V = GETJOCTET(*next_input_byte++); )
+#define INPUT_BYTE(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = *next_input_byte++; )
/* As above, but read two bytes interpreted as an unsigned 16-bit integer.
* V should be declared unsigned int or perhaps JLONG.
*/
-#define INPUT_2BYTES(cinfo,V,action) \
- MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
- MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V += GETJOCTET(*next_input_byte++); )
+#define INPUT_2BYTES(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = ((unsigned int)(*next_input_byte++)) << 8; \
+ MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V += *next_input_byte++; )
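
Dropping GETJOCTET() here is safe because JOCTET is an 8-bit unsigned char on every supported platform, which made the macro an identity; that also leaves the big-endian assembly in INPUT_2BYTES easy to read. A worked example:

    #include <stdio.h>

    int main(void)
    {
      unsigned char buf[] = { 0x00, 0x54 };   /* e.g. a marker length field */
      unsigned int V = ((unsigned int)buf[0]) << 8;
      V += buf[1];
      printf("length = %u\n", V);             /* 84 */
      return 0;
    }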
/*
@@ -197,7 +197,7 @@ typedef my_marker_reader *my_marker_ptr;
LOCAL(boolean)
-get_soi (j_decompress_ptr cinfo)
+get_soi(j_decompress_ptr cinfo)
/* Process an SOI marker */
{
int i;
@@ -237,7 +237,7 @@ get_soi (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
/* Process a SOFn marker */
{
JLONG length;
@@ -258,7 +258,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
length -= 8;
TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
- (int) cinfo->image_width, (int) cinfo->image_height,
+ (int)cinfo->image_width, (int)cinfo->image_height,
cinfo->num_components);
if (cinfo->marker->saw_SOF)
@@ -267,16 +267,16 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
/* We don't support files in which the image height is initially specified */
/* as 0 and is later redefined by DNL. As long as we have to check that, */
/* might as well have a general sanity check. */
- if (cinfo->image_height <= 0 || cinfo->image_width <= 0
- || cinfo->num_components <= 0)
+ if (cinfo->image_height <= 0 || cinfo->image_width <= 0 ||
+ cinfo->num_components <= 0)
ERREXIT(cinfo, JERR_EMPTY_IMAGE);
if (length != (cinfo->num_components * 3))
ERREXIT(cinfo, JERR_BAD_LENGTH);
if (cinfo->comp_info == NULL) /* do only once, even if suspend */
- cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ cinfo->comp_info = (jpeg_component_info *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->num_components * sizeof(jpeg_component_info));
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -301,7 +301,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
LOCAL(boolean)
-get_sos (j_decompress_ptr cinfo)
+get_sos(j_decompress_ptr cinfo)
/* Process a SOS marker */
{
JLONG length;
@@ -309,7 +309,7 @@ get_sos (j_decompress_ptr cinfo)
jpeg_component_info *compptr;
INPUT_VARS(cinfo);
- if (! cinfo->marker->saw_SOF)
+ if (!cinfo->marker->saw_SOF)
ERREXIT(cinfo, JERR_SOS_NO_SOF);
INPUT_2BYTES(cinfo, length, return FALSE);
@@ -341,7 +341,7 @@ get_sos (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
- id_found:
+id_found:
cinfo->cur_comp_info[i] = compptr;
compptr->dc_tbl_no = (c >> 4) & 15;
@@ -384,7 +384,7 @@ get_sos (j_decompress_ptr cinfo)
#ifdef D_ARITH_CODING_SUPPORTED
LOCAL(boolean)
-get_dac (j_decompress_ptr cinfo)
+get_dac(j_decompress_ptr cinfo)
/* Process a DAC marker */
{
JLONG length;
@@ -402,14 +402,14 @@ get_dac (j_decompress_ptr cinfo)
TRACEMS2(cinfo, 1, JTRC_DAC, index, val);
- if (index < 0 || index >= (2*NUM_ARITH_TBLS))
+ if (index < 0 || index >= (2 * NUM_ARITH_TBLS))
ERREXIT1(cinfo, JERR_DAC_INDEX, index);
if (index >= NUM_ARITH_TBLS) { /* define AC table */
- cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val;
+ cinfo->arith_ac_K[index - NUM_ARITH_TBLS] = (UINT8)val;
} else { /* define DC table */
- cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F);
- cinfo->arith_dc_U[index] = (UINT8) (val >> 4);
+ cinfo->arith_dc_L[index] = (UINT8)(val & 0x0F);
+ cinfo->arith_dc_U[index] = (UINT8)(val >> 4);
if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
ERREXIT1(cinfo, JERR_DAC_VALUE, val);
}
@@ -422,7 +422,7 @@ get_dac (j_decompress_ptr cinfo)
return TRUE;
}
-#else /* ! D_ARITH_CODING_SUPPORTED */
+#else /* !D_ARITH_CODING_SUPPORTED */
#define get_dac(cinfo) skip_variable(cinfo)
@@ -430,7 +430,7 @@ get_dac (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dht (j_decompress_ptr cinfo)
+get_dht(j_decompress_ptr cinfo)
/* Process a DHT marker */
{
JLONG length;
@@ -467,13 +467,13 @@ get_dht (j_decompress_ptr cinfo)
/* Here we just do minimal validation of the counts to avoid walking
* off the end of our table space. jdhuff.c will check more carefully.
*/
- if (count > 256 || ((JLONG) count) > length)
+ if (count > 256 || ((JLONG)count) > length)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
for (i = 0; i < count; i++)
INPUT_BYTE(cinfo, huffval[i], return FALSE);
- MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
+ memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
length -= count;
@@ -489,10 +489,10 @@ get_dht (j_decompress_ptr cinfo)
}
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
- MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
- MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+ memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
}
if (length != 0)
@@ -504,7 +504,7 @@ get_dht (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dqt (j_decompress_ptr cinfo)
+get_dqt(j_decompress_ptr cinfo)
/* Process a DQT marker */
{
JLONG length;
@@ -527,7 +527,7 @@ get_dqt (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_DQT_INDEX, n);
if (cinfo->quant_tbl_ptrs[n] == NULL)
- cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+ cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr)cinfo);
quant_ptr = cinfo->quant_tbl_ptrs[n];
for (i = 0; i < DCTSIZE2; i++) {
@@ -536,20 +536,20 @@ get_dqt (j_decompress_ptr cinfo)
else
INPUT_BYTE(cinfo, tmp, return FALSE);
/* We convert the zigzag-order table to natural array order. */
- quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+ quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16)tmp;
}
if (cinfo->err->trace_level >= 2) {
for (i = 0; i < DCTSIZE2; i += 8) {
TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
- quant_ptr->quantval[i], quant_ptr->quantval[i+1],
- quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
- quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
- quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
+ quant_ptr->quantval[i], quant_ptr->quantval[i + 1],
+ quant_ptr->quantval[i + 2], quant_ptr->quantval[i + 3],
+ quant_ptr->quantval[i + 4], quant_ptr->quantval[i + 5],
+ quant_ptr->quantval[i + 6], quant_ptr->quantval[i + 7]);
}
}
- length -= DCTSIZE2+1;
+ length -= DCTSIZE2 + 1;
if (prec) length -= DCTSIZE2;
}
@@ -562,7 +562,7 @@ get_dqt (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dri (j_decompress_ptr cinfo)
+get_dri(j_decompress_ptr cinfo)
/* Process a DRI marker */
{
JLONG length;
@@ -598,28 +598,28 @@ get_dri (j_decompress_ptr cinfo)
LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
- unsigned int datalen, JLONG remaining)
+examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
/* Examine first few bytes from an APP0.
* Take appropriate action if it is a JFIF marker.
* datalen is # of bytes at data[], remaining is length of rest of marker data.
*/
{
- JLONG totallen = (JLONG) datalen + remaining;
+ JLONG totallen = (JLONG)datalen + remaining;
if (datalen >= APP0_DATA_LEN &&
- GETJOCTET(data[0]) == 0x4A &&
- GETJOCTET(data[1]) == 0x46 &&
- GETJOCTET(data[2]) == 0x49 &&
- GETJOCTET(data[3]) == 0x46 &&
- GETJOCTET(data[4]) == 0) {
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x49 &&
+ data[3] == 0x46 &&
+ data[4] == 0) {
/* Found JFIF APP0 marker: save info */
cinfo->saw_JFIF_marker = TRUE;
- cinfo->JFIF_major_version = GETJOCTET(data[5]);
- cinfo->JFIF_minor_version = GETJOCTET(data[6]);
- cinfo->density_unit = GETJOCTET(data[7]);
- cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
- cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+ cinfo->JFIF_major_version = data[5];
+ cinfo->JFIF_minor_version = data[6];
+ cinfo->density_unit = data[7];
+ cinfo->X_density = (data[8] << 8) + data[9];
+ cinfo->Y_density = (data[10] << 8) + data[11];
/* Check version.
* Major version must be 1, anything else signals an incompatible change.
* (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,48 +634,45 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
/* Validate thumbnail dimensions and issue appropriate messages */
- if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
- TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
- GETJOCTET(data[12]), GETJOCTET(data[13]));
+ if (data[12] | data[13])
+ TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
totallen -= APP0_DATA_LEN;
- if (totallen !=
- ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
- TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
+ if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
+ TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
} else if (datalen >= 6 &&
- GETJOCTET(data[0]) == 0x4A &&
- GETJOCTET(data[1]) == 0x46 &&
- GETJOCTET(data[2]) == 0x58 &&
- GETJOCTET(data[3]) == 0x58 &&
- GETJOCTET(data[4]) == 0) {
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x58 &&
+ data[3] == 0x58 &&
+ data[4] == 0) {
/* Found JFIF "JFXX" extension APP0 marker */
/* The library doesn't actually do anything with these,
* but we try to produce a helpful trace message.
*/
- switch (GETJOCTET(data[5])) {
+ switch (data[5]) {
case 0x10:
- TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
break;
case 0x11:
- TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int)totallen);
break;
case 0x13:
- TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
break;
default:
- TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
- GETJOCTET(data[5]), (int) totallen);
+ TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
break;
}
} else {
/* Start of APP0 does not match "JFIF" or "JFXX", or too short */
- TRACEMS1(cinfo, 1, JTRC_APP0, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_APP0, (int)totallen);
}
}
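
The thumbnail sanity check above compares the bytes remaining after the 14-byte JFIF payload against W*H uncompressed RGB triples. A worked example:

    #include <stdio.h>

    int main(void)
    {
      unsigned thumb_w = 16, thumb_h = 8;     /* data[12] and data[13] */
      long expected = (long)thumb_w * (long)thumb_h * 3L;
      printf("a %ux%u thumbnail needs %ld trailing bytes\n",
             thumb_w, thumb_h, expected);     /* 384 */
      return 0;
    }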
LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
- unsigned int datalen, JLONG remaining)
+examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
/* Examine first few bytes from an APP14.
* Take appropriate action if it is an Adobe marker.
* datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -684,28 +681,28 @@ examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
unsigned int version, flags0, flags1, transform;
if (datalen >= APP14_DATA_LEN &&
- GETJOCTET(data[0]) == 0x41 &&
- GETJOCTET(data[1]) == 0x64 &&
- GETJOCTET(data[2]) == 0x6F &&
- GETJOCTET(data[3]) == 0x62 &&
- GETJOCTET(data[4]) == 0x65) {
+ data[0] == 0x41 &&
+ data[1] == 0x64 &&
+ data[2] == 0x6F &&
+ data[3] == 0x62 &&
+ data[4] == 0x65) {
/* Found Adobe APP14 marker */
- version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
- flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
- flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
- transform = GETJOCTET(data[11]);
+ version = (data[5] << 8) + data[6];
+ flags0 = (data[7] << 8) + data[8];
+ flags1 = (data[9] << 8) + data[10];
+ transform = data[11];
TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
cinfo->saw_Adobe_marker = TRUE;
- cinfo->Adobe_transform = (UINT8) transform;
+ cinfo->Adobe_transform = (UINT8)transform;
} else {
/* Start of APP14 does not match "Adobe", or too short */
- TRACEMS1(cinfo, 1, JTRC_APP14, (int) (datalen + remaining));
+ TRACEMS1(cinfo, 1, JTRC_APP14, (int)(datalen + remaining));
}
}
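
For reference, the saved Adobe_transform byte is consulted later when guessing the color space; the values below follow the Adobe APP14 convention as libjpeg interprets it (a sketch, not a header from the patch):

    enum adobe_transform_sketch {
      ADOBE_TRANSFORM_UNKNOWN = 0,   /* RGB or CMYK, stored directly */
      ADOBE_TRANSFORM_YCBCR = 1,     /* 3-channel YCbCr */
      ADOBE_TRANSFORM_YCCK = 2       /* 4-channel YCCK (CMYK via YCC) */
    };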
METHODDEF(boolean)
-get_interesting_appn (j_decompress_ptr cinfo)
+get_interesting_appn(j_decompress_ptr cinfo)
/* Process an APP0 or APP14 marker without saving it */
{
JLONG length;
@@ -720,7 +717,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
if (length >= APPN_DATA_LEN)
numtoread = APPN_DATA_LEN;
else if (length > 0)
- numtoread = (unsigned int) length;
+ numtoread = (unsigned int)length;
else
numtoread = 0;
for (i = 0; i < numtoread; i++)
@@ -730,10 +727,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
/* process it */
switch (cinfo->unread_marker) {
case M_APP0:
- examine_app0(cinfo, (JOCTET *) b, numtoread, length);
+ examine_app0(cinfo, (JOCTET *)b, numtoread, length);
break;
case M_APP14:
- examine_app14(cinfo, (JOCTET *) b, numtoread, length);
+ examine_app14(cinfo, (JOCTET *)b, numtoread, length);
break;
default:
/* can't get here unless jpeg_save_markers chooses wrong processor */
@@ -744,7 +741,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
/* skip any remaining data -- could be lots */
INPUT_SYNC(cinfo);
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -753,10 +750,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
#ifdef SAVE_MARKERS_SUPPORTED
METHODDEF(boolean)
-save_marker (j_decompress_ptr cinfo)
+save_marker(j_decompress_ptr cinfo)
/* Save an APPn or COM marker into the marker list */
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
unsigned int bytes_read, data_length;
JOCTET *data;
@@ -770,22 +767,22 @@ save_marker (j_decompress_ptr cinfo)
if (length >= 0) { /* watch out for bogus length word */
/* figure out how much we want to save */
unsigned int limit;
- if (cinfo->unread_marker == (int) M_COM)
+ if (cinfo->unread_marker == (int)M_COM)
limit = marker->length_limit_COM;
else
- limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
- if ((unsigned int) length < limit)
- limit = (unsigned int) length;
+ limit = marker->length_limit_APPn[cinfo->unread_marker - (int)M_APP0];
+ if ((unsigned int)length < limit)
+ limit = (unsigned int)length;
/* allocate and initialize the marker item */
cur_marker = (jpeg_saved_marker_ptr)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(struct jpeg_marker_struct) + limit);
cur_marker->next = NULL;
- cur_marker->marker = (UINT8) cinfo->unread_marker;
- cur_marker->original_length = (unsigned int) length;
+ cur_marker->marker = (UINT8)cinfo->unread_marker;
+ cur_marker->original_length = (unsigned int)length;
cur_marker->data_length = limit;
/* data area is just beyond the jpeg_marker_struct */
- data = cur_marker->data = (JOCTET *) (cur_marker + 1);
+ data = cur_marker->data = (JOCTET *)(cur_marker + 1);
marker->cur_marker = cur_marker;
marker->bytes_read = 0;
bytes_read = 0;
@@ -843,14 +840,14 @@ save_marker (j_decompress_ptr cinfo)
break;
default:
TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
- (int) (data_length + length));
+ (int)(data_length + length));
break;
}
/* skip any remaining data -- could be lots */
INPUT_SYNC(cinfo); /* do before skip_input_data */
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -859,7 +856,7 @@ save_marker (j_decompress_ptr cinfo)
METHODDEF(boolean)
-skip_variable (j_decompress_ptr cinfo)
+skip_variable(j_decompress_ptr cinfo)
/* Skip over an unknown or uninteresting variable-length marker */
{
JLONG length;
@@ -868,11 +865,11 @@ skip_variable (j_decompress_ptr cinfo)
INPUT_2BYTES(cinfo, length, return FALSE);
length -= 2;
- TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length);
+ TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int)length);
INPUT_SYNC(cinfo); /* do before skip_input_data */
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -888,7 +885,7 @@ skip_variable (j_decompress_ptr cinfo)
*/
LOCAL(boolean)
-next_marker (j_decompress_ptr cinfo)
+next_marker(j_decompress_ptr cinfo)
{
int c;
INPUT_VARS(cinfo);
@@ -935,7 +932,7 @@ next_marker (j_decompress_ptr cinfo)
LOCAL(boolean)
-first_marker (j_decompress_ptr cinfo)
+first_marker(j_decompress_ptr cinfo)
/* Like next_marker, but used to obtain the initial SOI marker. */
/* For this marker, we do not allow preceding garbage or fill; otherwise,
* we might well scan an entire input file before realizing it ain't JPEG.
@@ -948,7 +945,7 @@ first_marker (j_decompress_ptr cinfo)
INPUT_BYTE(cinfo, c, return FALSE);
INPUT_BYTE(cinfo, c2, return FALSE);
- if (c != 0xFF || c2 != (int) M_SOI)
+ if (c != 0xFF || c2 != (int)M_SOI)
ERREXIT2(cinfo, JERR_NO_SOI, c, c2);
cinfo->unread_marker = c2;
@@ -966,18 +963,18 @@ first_marker (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-read_markers (j_decompress_ptr cinfo)
+read_markers(j_decompress_ptr cinfo)
{
/* Outer loop repeats once for each marker. */
for (;;) {
/* Collect the marker proper, unless we already did. */
/* NB: first_marker() enforces the requirement that SOI appear first. */
if (cinfo->unread_marker == 0) {
- if (! cinfo->marker->saw_SOI) {
- if (! first_marker(cinfo))
+ if (!cinfo->marker->saw_SOI) {
+ if (!first_marker(cinfo))
return JPEG_SUSPENDED;
} else {
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return JPEG_SUSPENDED;
}
}
@@ -987,28 +984,28 @@ read_markers (j_decompress_ptr cinfo)
*/
switch (cinfo->unread_marker) {
case M_SOI:
- if (! get_soi(cinfo))
+ if (!get_soi(cinfo))
return JPEG_SUSPENDED;
break;
case M_SOF0: /* Baseline */
case M_SOF1: /* Extended sequential, Huffman */
- if (! get_sof(cinfo, FALSE, FALSE))
+ if (!get_sof(cinfo, FALSE, FALSE))
return JPEG_SUSPENDED;
break;
case M_SOF2: /* Progressive, Huffman */
- if (! get_sof(cinfo, TRUE, FALSE))
+ if (!get_sof(cinfo, TRUE, FALSE))
return JPEG_SUSPENDED;
break;
case M_SOF9: /* Extended sequential, arithmetic */
- if (! get_sof(cinfo, FALSE, TRUE))
+ if (!get_sof(cinfo, FALSE, TRUE))
return JPEG_SUSPENDED;
break;
case M_SOF10: /* Progressive, arithmetic */
- if (! get_sof(cinfo, TRUE, TRUE))
+ if (!get_sof(cinfo, TRUE, TRUE))
return JPEG_SUSPENDED;
break;
@@ -1026,7 +1023,7 @@ read_markers (j_decompress_ptr cinfo)
break;
case M_SOS:
- if (! get_sos(cinfo))
+ if (!get_sos(cinfo))
return JPEG_SUSPENDED;
cinfo->unread_marker = 0; /* processed the marker */
return JPEG_REACHED_SOS;
@@ -1037,22 +1034,22 @@ read_markers (j_decompress_ptr cinfo)
return JPEG_REACHED_EOI;
case M_DAC:
- if (! get_dac(cinfo))
+ if (!get_dac(cinfo))
return JPEG_SUSPENDED;
break;
case M_DHT:
- if (! get_dht(cinfo))
+ if (!get_dht(cinfo))
return JPEG_SUSPENDED;
break;
case M_DQT:
- if (! get_dqt(cinfo))
+ if (!get_dqt(cinfo))
return JPEG_SUSPENDED;
break;
case M_DRI:
- if (! get_dri(cinfo))
+ if (!get_dri(cinfo))
return JPEG_SUSPENDED;
break;
@@ -1072,13 +1069,13 @@ read_markers (j_decompress_ptr cinfo)
case M_APP13:
case M_APP14:
case M_APP15:
- if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[
- cinfo->unread_marker - (int) M_APP0]) (cinfo))
+ if (!(*((my_marker_ptr)cinfo->marker)->process_APPn[
+ cinfo->unread_marker - (int)M_APP0]) (cinfo))
return JPEG_SUSPENDED;
break;
case M_COM:
- if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
+ if (!(*((my_marker_ptr)cinfo->marker)->process_COM) (cinfo))
return JPEG_SUSPENDED;
break;
@@ -1095,7 +1092,7 @@ read_markers (j_decompress_ptr cinfo)
break;
case M_DNL: /* Ignore DNL ... perhaps the wrong thing */
- if (! skip_variable(cinfo))
+ if (!skip_variable(cinfo))
return JPEG_SUSPENDED;
break;
@@ -1127,25 +1124,25 @@ read_markers (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-read_restart_marker (j_decompress_ptr cinfo)
+read_restart_marker(j_decompress_ptr cinfo)
{
/* Obtain a marker unless we already did. */
/* Note that next_marker will complain if it skips any data. */
if (cinfo->unread_marker == 0) {
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return FALSE;
}
if (cinfo->unread_marker ==
- ((int) M_RST0 + cinfo->marker->next_restart_num)) {
+ ((int)M_RST0 + cinfo->marker->next_restart_num)) {
/* Normal case --- swallow the marker and let entropy decoder continue */
TRACEMS1(cinfo, 3, JTRC_RST, cinfo->marker->next_restart_num);
cinfo->unread_marker = 0;
} else {
/* Uh-oh, the restart markers have been messed up. */
/* Let the data source manager determine how to resync. */
- if (! (*cinfo->src->resync_to_restart) (cinfo,
- cinfo->marker->next_restart_num))
+ if (!(*cinfo->src->resync_to_restart) (cinfo,
+ cinfo->marker->next_restart_num))
return FALSE;
}
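
The comparison above works because the eight restart markers 0xFFD0..0xFFD7 simply repeat: the expected code is always a modulo-8 offset from M_RST0, tracked by next_restart_num. A one-function sketch of that numbering:

/* Sketch: restart markers are 0xFFD0..0xFFD7 and repeat modulo 8,
 * so e.g. restart 5 -> 0xD5 and restart 8 wraps back to 0xD0. */
#define M_RST0 0xD0

static int
expected_restart_code(int next_restart_num)
{
  return M_RST0 + (next_restart_num & 7);
}
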
@@ -1206,7 +1203,7 @@ read_restart_marker (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
+jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired)
{
int marker = cinfo->unread_marker;
int action = 1;
@@ -1216,16 +1213,16 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
/* Outer loop handles repeated decision after scanning forward. */
for (;;) {
- if (marker < (int) M_SOF0)
+ if (marker < (int)M_SOF0)
action = 2; /* invalid marker */
- else if (marker < (int) M_RST0 || marker > (int) M_RST7)
+ else if (marker < (int)M_RST0 || marker > (int)M_RST7)
action = 3; /* valid non-restart marker */
else {
- if (marker == ((int) M_RST0 + ((desired+1) & 7)) ||
- marker == ((int) M_RST0 + ((desired+2) & 7)))
+ if (marker == ((int)M_RST0 + ((desired + 1) & 7)) ||
+ marker == ((int)M_RST0 + ((desired + 2) & 7)))
action = 3; /* one of the next two expected restarts */
- else if (marker == ((int) M_RST0 + ((desired-1) & 7)) ||
- marker == ((int) M_RST0 + ((desired-2) & 7)))
+ else if (marker == ((int)M_RST0 + ((desired - 1) & 7)) ||
+ marker == ((int)M_RST0 + ((desired - 2) & 7)))
action = 2; /* a prior restart, so advance */
else
action = 1; /* desired restart or too far away */
@@ -1238,7 +1235,7 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
return TRUE;
case 2:
/* Scan to the next marker, and repeat the decision loop. */
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return FALSE;
marker = cinfo->unread_marker;
break;
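
The (desired +/- k) & 7 tests above measure distance around that modulo-8 cycle: with desired == 3, an RST5 matches (desired + 2) & 7 and is accepted as one of the next expected restarts (action 3), while an RST1 matches (desired - 2) & 7 and is treated as stale, so the scan advances (action 2). The same decision, restated as a pure function with the marker constants spelled out (0xC0, 0xD0, 0xD7 are M_SOF0, M_RST0, M_RST7):

/* Sketch: the modulo-8 distance heuristic as a pure function.
 * Returns the action code used above: 1 = resume at the desired
 * restart, 2 = skip forward, 3 = accept the marker as-is. */
static int
classify_marker(int marker, int desired)
{
  if (marker < 0xC0)                           /* below M_SOF0: invalid */
    return 2;
  if (marker < 0xD0 || marker > 0xD7)          /* valid non-restart marker */
    return 3;
  if (marker == 0xD0 + ((desired + 1) & 7) ||
      marker == 0xD0 + ((desired + 2) & 7))
    return 3;                                  /* one of the next two */
  if (marker == 0xD0 + ((desired - 1) & 7) ||
      marker == 0xD0 + ((desired - 2) & 7))
    return 2;                                  /* a prior restart */
  return 1;                                    /* desired or too far away */
}
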
@@ -1256,9 +1253,9 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
*/
METHODDEF(void)
-reset_marker_reader (j_decompress_ptr cinfo)
+reset_marker_reader(j_decompress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
cinfo->comp_info = NULL; /* until allocated by get_sof */
cinfo->input_scan_number = 0; /* no SOS seen yet */
@@ -1276,16 +1273,16 @@ reset_marker_reader (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_marker_reader (j_decompress_ptr cinfo)
+jinit_marker_reader(j_decompress_ptr cinfo)
{
my_marker_ptr marker;
int i;
/* Create subobject in permanent pool */
marker = (my_marker_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_marker_reader));
- cinfo->marker = (struct jpeg_marker_reader *) marker;
+ cinfo->marker = (struct jpeg_marker_reader *)marker;
/* Initialize public method pointers */
marker->pub.reset_marker_reader = reset_marker_reader;
marker->pub.read_markers = read_markers;
@@ -1314,10 +1311,10 @@ jinit_marker_reader (j_decompress_ptr cinfo)
#ifdef SAVE_MARKERS_SUPPORTED
GLOBAL(void)
-jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
- unsigned int length_limit)
+jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+ unsigned int length_limit)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
long maxlength;
jpeg_marker_parser_method processor;
@@ -1325,8 +1322,8 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
* (should only be a concern in a 16-bit environment).
*/
maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
- if (((long) length_limit) > maxlength)
- length_limit = (unsigned int) maxlength;
+ if (((long)length_limit) > maxlength)
+ length_limit = (unsigned int)maxlength;
/* Choose processor routine to use.
* APP0/APP14 have special requirements.
@@ -1334,23 +1331,23 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
if (length_limit) {
processor = save_marker;
/* If saving APP0/APP14, save at least enough for our internal use. */
- if (marker_code == (int) M_APP0 && length_limit < APP0_DATA_LEN)
+ if (marker_code == (int)M_APP0 && length_limit < APP0_DATA_LEN)
length_limit = APP0_DATA_LEN;
- else if (marker_code == (int) M_APP14 && length_limit < APP14_DATA_LEN)
+ else if (marker_code == (int)M_APP14 && length_limit < APP14_DATA_LEN)
length_limit = APP14_DATA_LEN;
} else {
processor = skip_variable;
/* If discarding APP0/APP14, use our regular on-the-fly processor. */
- if (marker_code == (int) M_APP0 || marker_code == (int) M_APP14)
+ if (marker_code == (int)M_APP0 || marker_code == (int)M_APP14)
processor = get_interesting_appn;
}
- if (marker_code == (int) M_COM) {
+ if (marker_code == (int)M_COM) {
marker->process_COM = processor;
marker->length_limit_COM = length_limit;
- } else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) {
- marker->process_APPn[marker_code - (int) M_APP0] = processor;
- marker->length_limit_APPn[marker_code - (int) M_APP0] = length_limit;
+ } else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15) {
+ marker->process_APPn[marker_code - (int)M_APP0] = processor;
+ marker->length_limit_APPn[marker_code - (int)M_APP0] = length_limit;
} else
ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
}
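
Applications call jpeg_save_markers() after jpeg_create_decompress() but before jpeg_read_header(); segments retained by save_marker then appear on cinfo->marker_list. A usage sketch, assuming cinfo already has an error manager and a data source:

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: keep COM and APP1 (e.g. Exif) payloads while parsing headers. */
static void
dump_saved_markers(j_decompress_ptr cinfo)
{
  jpeg_saved_marker_ptr m;

  jpeg_save_markers(cinfo, JPEG_COM, 0xFFFF);
  jpeg_save_markers(cinfo, JPEG_APP0 + 1, 0xFFFF);
  if (jpeg_read_header(cinfo, TRUE) == JPEG_HEADER_OK) {
    for (m = cinfo->marker_list; m != NULL; m = m->next)
      printf("marker 0x%02X: %u of %u bytes saved\n",
             m->marker, m->data_length, m->original_length);
  }
}
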
@@ -1363,15 +1360,15 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
*/
GLOBAL(void)
-jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code,
- jpeg_marker_parser_method routine)
+jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+ jpeg_marker_parser_method routine)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
- if (marker_code == (int) M_COM)
+ if (marker_code == (int)M_COM)
marker->process_COM = routine;
- else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15)
- marker->process_APPn[marker_code - (int) M_APP0] = routine;
+ else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15)
+ marker->process_APPn[marker_code - (int)M_APP0] = routine;
else
ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
}
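
A routine installed through jpeg_set_marker_processor() must read the marker's 2-byte length word itself before consuming or skipping the payload. A sketch of a do-nothing COM handler that honors that wire format; it assumes a non-suspending source (fill_input_buffer never returns FALSE), since a suspending source would need to back up the bytes already consumed before a retry:

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: a COM processor that reads the length word, then skips. */
static boolean
my_com_processor(j_decompress_ptr cinfo)
{
  struct jpeg_source_mgr *src = cinfo->src;
  long length;

  if (src->bytes_in_buffer == 0 && !(*src->fill_input_buffer) (cinfo))
    return FALSE;
  src->bytes_in_buffer--;
  length = (long)(*src->next_input_byte++) << 8;
  if (src->bytes_in_buffer == 0 && !(*src->fill_input_buffer) (cinfo))
    return FALSE;
  src->bytes_in_buffer--;
  length += (long)(*src->next_input_byte++);
  length -= 2;                          /* the length word counts itself */
  if (length > 0)
    (*src->skip_input_data) (cinfo, length);
  return TRUE;
}

/* registered with:
 *   jpeg_set_marker_processor(&cinfo, JPEG_COM, my_com_processor);
 */
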
diff --git a/media/libjpeg/jdmaster.c b/media/libjpeg/jdmaster.c
index 9079dda65c..a3690bf560 100644
--- a/media/libjpeg/jdmaster.c
+++ b/media/libjpeg/jdmaster.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
#include "jpeglib.h"
#include "jpegcomp.h"
#include "jdmaster.h"
-#include "jsimd.h"
/*
@@ -31,7 +30,7 @@
*/
LOCAL(boolean)
-use_merged_upsample (j_decompress_ptr cinfo)
+use_merged_upsample(j_decompress_ptr cinfo)
{
#ifdef UPSAMPLE_MERGING_SUPPORTED
/* Merging is the equivalent of plain box-filter upsampling */
@@ -40,22 +39,22 @@ use_merged_upsample (j_decompress_ptr cinfo)
/* jdmerge.c only supports YCC=>RGB and YCC=>RGB565 color conversion */
if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
(cinfo->out_color_space != JCS_RGB &&
- cinfo->out_color_space != JCS_RGB565 &&
- cinfo->out_color_space != JCS_EXT_RGB &&
- cinfo->out_color_space != JCS_EXT_RGBX &&
- cinfo->out_color_space != JCS_EXT_BGR &&
- cinfo->out_color_space != JCS_EXT_BGRX &&
- cinfo->out_color_space != JCS_EXT_XBGR &&
- cinfo->out_color_space != JCS_EXT_XRGB &&
- cinfo->out_color_space != JCS_EXT_RGBA &&
- cinfo->out_color_space != JCS_EXT_BGRA &&
- cinfo->out_color_space != JCS_EXT_ABGR &&
- cinfo->out_color_space != JCS_EXT_ARGB))
+ cinfo->out_color_space != JCS_RGB565 &&
+ cinfo->out_color_space != JCS_EXT_RGB &&
+ cinfo->out_color_space != JCS_EXT_RGBX &&
+ cinfo->out_color_space != JCS_EXT_BGR &&
+ cinfo->out_color_space != JCS_EXT_BGRX &&
+ cinfo->out_color_space != JCS_EXT_XBGR &&
+ cinfo->out_color_space != JCS_EXT_XRGB &&
+ cinfo->out_color_space != JCS_EXT_RGBA &&
+ cinfo->out_color_space != JCS_EXT_BGRA &&
+ cinfo->out_color_space != JCS_EXT_ABGR &&
+ cinfo->out_color_space != JCS_EXT_ARGB))
return FALSE;
if ((cinfo->out_color_space == JCS_RGB565 &&
- cinfo->out_color_components != 3) ||
+ cinfo->out_color_components != 3) ||
(cinfo->out_color_space != JCS_RGB565 &&
- cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
+ cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
return FALSE;
/* and it only handles 2h1v or 2h2v sampling ratios */
if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -70,17 +69,6 @@ use_merged_upsample (j_decompress_ptr cinfo)
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
return FALSE;
-#ifdef WITH_SIMD
- /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
- isn't, then disabling merged upsampling is likely to be faster when
- decompressing YCbCr JPEG images. */
- if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
- jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
- (cinfo->out_color_space == JCS_RGB ||
- (cinfo->out_color_space >= JCS_EXT_RGB &&
- cinfo->out_color_space <= JCS_EXT_ARGB)))
- return FALSE;
-#endif
/* ??? also need to test for upsample-time rescaling, when & if supported */
return TRUE; /* by golly, it'll work... */
#else
@@ -100,7 +88,7 @@ GLOBAL(void)
#else
LOCAL(void)
#endif
-jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+jpeg_core_output_dimensions(j_decompress_ptr cinfo)
/* Do computations that are needed before master selection phase.
* This function is used for transcoding and full decompression.
*/
@@ -113,129 +101,129 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
/* Provide 1/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 1;
cinfo->_min_DCT_v_scaled_size = 1;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
/* Provide 2/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 2L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 2L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 2;
cinfo->_min_DCT_v_scaled_size = 2;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
/* Provide 3/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 3L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 3L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 3;
cinfo->_min_DCT_v_scaled_size = 3;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
/* Provide 4/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 4L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 4L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 4;
cinfo->_min_DCT_v_scaled_size = 4;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
/* Provide 5/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 5L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 5L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 5;
cinfo->_min_DCT_v_scaled_size = 5;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
/* Provide 6/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 6L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 6L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 6;
cinfo->_min_DCT_v_scaled_size = 6;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
/* Provide 7/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 7L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 7L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 7;
cinfo->_min_DCT_v_scaled_size = 7;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
/* Provide 8/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 8L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 8L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 8;
cinfo->_min_DCT_v_scaled_size = 8;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
/* Provide 9/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 9L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 9L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 9;
cinfo->_min_DCT_v_scaled_size = 9;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
/* Provide 10/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 10L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 10L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 10;
cinfo->_min_DCT_v_scaled_size = 10;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
/* Provide 11/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 11L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 11L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 11;
cinfo->_min_DCT_v_scaled_size = 11;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
/* Provide 12/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 12L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 12L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 12;
cinfo->_min_DCT_v_scaled_size = 12;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
/* Provide 13/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 13L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 13L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 13;
cinfo->_min_DCT_v_scaled_size = 13;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
/* Provide 14/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 14L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 14L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 14;
cinfo->_min_DCT_v_scaled_size = 14;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
/* Provide 15/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 15L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 15L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 15;
cinfo->_min_DCT_v_scaled_size = 15;
} else {
/* Provide 16/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 16L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 16L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 16;
cinfo->_min_DCT_v_scaled_size = 16;
}
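
Every branch of the ladder above is the same formula with a different numerator: pick the smallest N in 1..16 with scale_num/scale_denom <= N/DCTSIZE, then compute output = ceil(image_dimension * N / DCTSIZE). A standalone sketch of that rule (jdiv_round_up in jutils.c is the ceiling division (a + b - 1) / b, reimplemented inline here):

#include <stdio.h>

#define DCTSIZE 8

int main(void)
{
  long image_width = 1000, scale_num = 1, scale_denom = 2;
  int N;

  /* smallest N in 1..16 with scale_num/scale_denom <= N/DCTSIZE */
  for (N = 1; N < 2 * DCTSIZE; N++)
    if (scale_num * DCTSIZE <= scale_denom * N)
      break;
  printf("N = %d, output_width = %ld\n",      /* prints N = 4, width = 500 */
         N, (image_width * N + DCTSIZE - 1) / DCTSIZE);
  return 0;
}
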
@@ -268,7 +256,7 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
+jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
/* Do computations that are needed before master selection phase */
{
#ifdef IDCT_SCALING_SUPPORTED
@@ -314,13 +302,13 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
ci++, compptr++) {
/* Size in samples, after IDCT scaling */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width *
- (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width *
+ (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height *
- (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height *
+ (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
}
#else /* !IDCT_SCALING_SUPPORTED */
@@ -417,31 +405,31 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
*/
LOCAL(void)
-prepare_range_limit_table (j_decompress_ptr cinfo)
+prepare_range_limit_table(j_decompress_ptr cinfo)
/* Allocate and fill in the sample_range_limit table */
{
JSAMPLE *table;
int i;
table = (JSAMPLE *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
- table += (MAXJSAMPLE+1); /* allow negative subscripts of simple table */
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+ table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */
cinfo->sample_range_limit = table;
/* First segment of "simple" table: limit[x] = 0 for x < 0 */
- MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE));
+ memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
/* Main part of "simple" table: limit[x] = x */
for (i = 0; i <= MAXJSAMPLE; i++)
- table[i] = (JSAMPLE) i;
+ table[i] = (JSAMPLE)i;
table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */
/* End of simple table, rest of first half of post-IDCT table */
- for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++)
+ for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
table[i] = MAXJSAMPLE;
/* Second half of post-IDCT table */
- MEMZERO(table + (2 * (MAXJSAMPLE+1)),
- (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
- MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
- cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+ memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+ (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+ memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+ cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
}
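
The table built above is really two tables glued together: a "simple" clamp (negative inputs map to 0, 0..MAXJSAMPLE to themselves) followed by a wraparound region that the IDCT indexes through a mask. A sketch of a plain clamp table in the same style, assuming 8-bit samples; the real table's extra post-IDCT segments are omitted here:

#include <string.h>

#define MAXJSAMPLE 255

static unsigned char storage[3 * (MAXJSAMPLE + 1)];

/* Sketch: table[x] for x in -(MAXJSAMPLE+1)..2*(MAXJSAMPLE+1)-1 clamps
 * x into 0..255 with a single lookup instead of two branches. */
static unsigned char *
build_simple_range_limit(void)
{
  unsigned char *table = storage + (MAXJSAMPLE + 1); /* negative idx OK */
  int i;

  memset(table - (MAXJSAMPLE + 1), 0, MAXJSAMPLE + 1);   /* x < 0   -> 0   */
  for (i = 0; i <= MAXJSAMPLE; i++)
    table[i] = (unsigned char)i;                         /* 0..255  -> x   */
  memset(table + (MAXJSAMPLE + 1), MAXJSAMPLE,
         MAXJSAMPLE + 1);                                /* x > 255 -> 255 */
  return table;
}
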
@@ -457,9 +445,9 @@ prepare_range_limit_table (j_decompress_ptr cinfo)
*/
LOCAL(void)
-master_selection (j_decompress_ptr cinfo)
+master_selection(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
boolean use_c_buffer;
long samplesperrow;
JDIMENSION jd_samplesperrow;
@@ -469,9 +457,10 @@ master_selection (j_decompress_ptr cinfo)
prepare_range_limit_table(cinfo);
/* Width of an output scanline must be representable as JDIMENSION. */
- samplesperrow = (long) cinfo->output_width * (long) cinfo->out_color_components;
- jd_samplesperrow = (JDIMENSION) samplesperrow;
- if ((long) jd_samplesperrow != samplesperrow)
+ samplesperrow = (long)cinfo->output_width *
+ (long)cinfo->out_color_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow;
+ if ((long)jd_samplesperrow != samplesperrow)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
/* Initialize my private state */
@@ -482,7 +471,7 @@ master_selection (j_decompress_ptr cinfo)
master->quantizer_1pass = NULL;
master->quantizer_2pass = NULL;
/* No mode changes if not using buffered-image mode. */
- if (! cinfo->quantize_colors || ! cinfo->buffered_image) {
+ if (!cinfo->quantize_colors || !cinfo->buffered_image) {
cinfo->enable_1pass_quant = FALSE;
cinfo->enable_external_quant = FALSE;
cinfo->enable_2pass_quant = FALSE;
@@ -528,7 +517,7 @@ master_selection (j_decompress_ptr cinfo)
}
/* Post-processing: in particular, color conversion first */
- if (! cinfo->raw_data_out) {
+ if (!cinfo->raw_data_out) {
if (master->using_merged_upsample) {
#ifdef UPSAMPLE_MERGING_SUPPORTED
jinit_merged_upsampler(cinfo); /* does color conversion too */
@@ -565,11 +554,11 @@ master_selection (j_decompress_ptr cinfo)
use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image;
jinit_d_coef_controller(cinfo, use_c_buffer);
- if (! cinfo->raw_data_out)
+ if (!cinfo->raw_data_out)
jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Initialize input side of decompressor to consume first scan. */
(*cinfo->inputctl->start_input_pass) (cinfo);
@@ -579,13 +568,14 @@ master_selection (j_decompress_ptr cinfo)
*/
cinfo->master->first_iMCU_col = 0;
cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+ cinfo->master->last_good_iMCU_row = 0;
#ifdef D_MULTISCAN_FILES_SUPPORTED
/* If jpeg_start_decompress will read the whole file, initialize
* progress monitoring appropriately. The input step is counted
* as one pass.
*/
- if (cinfo->progress != NULL && ! cinfo->buffered_image &&
+ if (cinfo->progress != NULL && !cinfo->buffered_image &&
cinfo->inputctl->has_multiple_scans) {
int nscans;
/* Estimate number of scans to set pass_limit. */
@@ -597,7 +587,7 @@ master_selection (j_decompress_ptr cinfo)
nscans = cinfo->num_components;
}
cinfo->progress->pass_counter = 0L;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
cinfo->progress->completed_passes = 0;
cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2);
/* Count the input pass as done */
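
The pass_counter/pass_limit values set here feed the optional progress monitor. A minimal callback sketch consuming those counters (my_progress is a hypothetical name; the fields are from struct jpeg_progress_mgr in jpeglib.h):

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: report percent-complete from the counters maintained above. */
static void
my_progress(j_common_ptr cinfo)
{
  struct jpeg_progress_mgr *p = cinfo->progress;
  long percent =
    p->pass_limit > 0 ? 100L * p->pass_counter / p->pass_limit : 0;
  fprintf(stderr, "pass %d of %d: %ld%%  \r",
          p->completed_passes + 1, p->total_passes, percent);
}

/* hooked up before decompression starts:
 *   struct jpeg_progress_mgr prog;
 *   prog.progress_monitor = my_progress;
 *   cinfo.progress = &prog;
 */
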
@@ -617,9 +607,9 @@ master_selection (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-prepare_for_output_pass (j_decompress_ptr cinfo)
+prepare_for_output_pass(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (master->pub.is_dummy_pass) {
#ifdef QUANT_2PASS_SUPPORTED
@@ -645,8 +635,8 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
}
(*cinfo->idct->start_pass) (cinfo);
(*cinfo->coef->start_output_pass) (cinfo);
- if (! cinfo->raw_data_out) {
- if (! master->using_merged_upsample)
+ if (!cinfo->raw_data_out) {
+ if (!master->using_merged_upsample)
(*cinfo->cconvert->start_pass) (cinfo);
(*cinfo->upsample->start_pass) (cinfo);
if (cinfo->quantize_colors)
@@ -665,7 +655,7 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
/* In buffered-image mode, we assume one more output pass if EOI not
* yet reached, but no more passes if EOI has been reached.
*/
- if (cinfo->buffered_image && ! cinfo->inputctl->eoi_reached) {
+ if (cinfo->buffered_image && !cinfo->inputctl->eoi_reached) {
cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1);
}
}
@@ -677,9 +667,9 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_output_pass (j_decompress_ptr cinfo)
+finish_output_pass(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->quantize_colors)
(*cinfo->cquantize->finish_pass) (cinfo);
@@ -694,9 +684,9 @@ finish_output_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_new_colormap (j_decompress_ptr cinfo)
+jpeg_new_colormap(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
/* Prevent application from calling me at wrong times */
if (cinfo->global_state != DSTATE_BUFIMAGE)
@@ -722,9 +712,9 @@ jpeg_new_colormap (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_master_decompress (j_decompress_ptr cinfo)
+jinit_master_decompress(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
master->pub.prepare_for_output_pass = prepare_for_output_pass;
master->pub.finish_output_pass = finish_output_pass;
diff --git a/media/libjpeg/jdmerge.c b/media/libjpeg/jdmerge.c
index 6276dd0950..3a456d6581 100644
--- a/media/libjpeg/jdmerge.c
+++ b/media/libjpeg/jdmerge.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2020, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -40,44 +40,16 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jdmerge.h"
#include "jsimd.h"
#include "jconfigint.h"
#ifdef UPSAMPLE_MERGING_SUPPORTED
-/* Private subobject */
-
-typedef struct {
- struct jpeg_upsampler pub; /* public fields */
-
- /* Pointer to routine to do actual upsampling/conversion of one row group */
- void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
- /* Private state for YCC->RGB conversion */
- int *Cr_r_tab; /* => table for Cr to R conversion */
- int *Cb_b_tab; /* => table for Cb to B conversion */
- JLONG *Cr_g_tab; /* => table for Cr to G conversion */
- JLONG *Cb_g_tab; /* => table for Cb to G conversion */
-
- /* For 2:1 vertical sampling, we produce two output rows at a time.
- * We need a "spare" row buffer to hold the second output row if the
- * application provides just a one-row buffer; we also use the spare
- * to discard the dummy last row if the image height is odd.
- */
- JSAMPROW spare_row;
- boolean spare_full; /* T if spare buffer is occupied */
-
- JDIMENSION out_row_width; /* samples per output row */
- JDIMENSION rows_to_go; /* counts rows remaining in image */
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* Include inline routines for colorspace extensions */
@@ -88,12 +60,12 @@ typedef my_upsampler *my_upsample_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -102,13 +74,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -118,12 +90,12 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -132,13 +104,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -148,13 +120,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -164,13 +136,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -187,25 +159,25 @@ typedef my_upsampler *my_upsample_ptr;
*/
LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
int i;
JLONG x;
SHIFT_TEMPS
upsample->Cr_r_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
upsample->Cb_b_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
upsample->Cr_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
upsample->Cb_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
/* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -217,10 +189,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
upsample->Cb_b_tab[i] = (int)
RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
/* Cr=>G value is scaled-up -0.71414 * x */
- upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+ upsample->Cr_g_tab[i] = (-FIX(0.71414)) * x;
/* Cb=>G value is scaled-up -0.34414 * x */
/* We also add in ONE_HALF so that need not do it in inner loop */
- upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+ upsample->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
}
}
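
The tables above precompute the fixed-point YCbCr-to-RGB terms: FIX(x) scales a constant by 2^SCALEBITS, and adding ONE_HALF before the right shift rounds instead of truncating. A standalone worked example of the red channel for one pixel, using arbitrary sample values:

#include <stdio.h>

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

int main(void)
{
  int y = 120, cr = 200;               /* example sample values */
  long x = cr - 128;                   /* center chroma on zero */
  int cred = (int)((FIX(1.40200) * x + ONE_HALF) >> SCALEBITS);
  int r = y + cred;                    /* would then be range-limited */
  printf("Cr_r_tab[%d] = %d, R = %d\n", cr, cred, r);  /* 101 and 221 */
  return 0;
}
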
@@ -230,9 +202,9 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_merged_upsample (j_decompress_ptr cinfo)
+start_pass_merged_upsample(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
/* Mark the spare buffer empty */
upsample->spare_full = FALSE;
@@ -248,14 +220,13 @@ start_pass_merged_upsample (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-merged_2v_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
/* 2:1 vertical sampling case: may need a spare row. */
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
JSAMPROW work_ptrs[2];
JDIMENSION num_rows; /* number of rows returned to caller */
@@ -264,8 +235,8 @@ merged_2v_upsample (j_decompress_ptr cinfo,
JDIMENSION size = upsample->out_row_width;
if (cinfo->out_color_space == JCS_RGB565)
size = cinfo->output_width * 2;
- jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
- 1, size);
+ jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
+ size);
num_rows = 1;
upsample->spare_full = FALSE;
} else {
@@ -294,20 +265,19 @@ merged_2v_upsample (j_decompress_ptr cinfo,
*out_row_ctr += num_rows;
upsample->rows_to_go -= num_rows;
/* When the buffer is emptied, declare this input row group consumed */
- if (! upsample->spare_full)
+ if (!upsample->spare_full)
(*in_row_group_ctr)++;
}
METHODDEF(void)
-merged_1v_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
/* 1:1 vertical sampling case: much easier, never need a spare row. */
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
/* Just do the upsampling. */
(*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
@@ -333,43 +303,42 @@ merged_1v_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGR:
- extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- default:
- h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
+ case JCS_EXT_RGB:
+ extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default:
+ h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
}
}
@@ -379,43 +348,42 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGR:
- extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- default:
- h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
+ case JCS_EXT_RGB:
+ extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default:
+ h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
}
}
@@ -424,24 +392,21 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
* RGB565 conversion
*/
-#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
- (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
- (((g) << 11) & 0xE000) | \
- (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+ ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+ (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
-#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
-#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
-
-#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
- ((INT16*)(addr))[0] = (INT16)(pixels); \
- ((INT16*)(addr))[1] = (INT16)((pixels) >> 16); \
+#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
+ ((INT16 *)(addr))[0] = (INT16)(pixels); \
+ ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
}
-#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
- ((INT16*)(addr))[1] = (INT16)(pixels); \
- ((INT16*)(addr))[0] = (INT16)((pixels) >> 16); \
+#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
+ ((INT16 *)(addr))[1] = (INT16)(pixels); \
+ ((INT16 *)(addr))[0] = (INT16)((pixels) >> 16); \
}
#define DITHER_565_R(r, dither) ((r) + ((dither) & 0xFF))
@@ -452,7 +417,7 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
/* Declarations for ordered dithering
*
* We use a 4x4 ordered dither array packed into 32 bits. This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
*/
#define DITHER_MASK 0x3
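
PACK_SHORT_565 squeezes 8-8-8 RGB into 5-6-5 bits by masking each channel's high bits into place; the DITHER_565_* macros add a per-pixel offset from the packed 4x4 matrix before range-limiting, trading banding for noise. A quick standalone check of the little-endian packer on an arbitrary pixel:

#include <stdio.h>

#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))

int main(void)
{
  /* r=255 -> top 5 bits 0xF800, g=128 -> middle 6 bits 0x0400,
   * b=64 -> low 5 bits 0x0008; together 0xFC08 */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 128, 64));
  return 0;
}
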
@@ -467,13 +432,13 @@ static const JLONG dither_matrix[4] = {
/* Include inline routines for RGB565 conversion */
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
#include "jdmrg565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -483,13 +448,13 @@ static const JLONG dither_matrix[4] = {
#undef h2v2_merged_upsample_565_internal
#undef h2v2_merged_upsample_565D_internal
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
#include "jdmrg565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -503,16 +468,15 @@ static const JLONG dither_matrix[4] = {
static INLINE boolean is_big_endian(void)
{
int test_value = 1;
- if(*(char *)&test_value != 1)
+ if (*(char *)&test_value != 1)
return TRUE;
return FALSE;
}
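
is_big_endian() probes the first byte of an int at run time to choose between the _le and _be pixel writers; with the function marked INLINE, compilers generally fold the test to a constant. The same probe, runnable standalone:

#include <stdio.h>

int main(void)
{
  int test_value = 1;
  int big = (*(char *)&test_value != 1);
  printf("%s-endian\n", big ? "big" : "little");
  return 0;
}
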
METHODDEF(void)
-h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -520,13 +484,12 @@ h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
else
h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
output_buf);
- }
+}
METHODDEF(void)
-h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -538,9 +501,8 @@ h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
METHODDEF(void)
-h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -552,9 +514,8 @@ h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
METHODDEF(void)
-h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -574,14 +535,14 @@ h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_merged_upsampler (j_decompress_ptr cinfo)
+jinit_merged_upsampler(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample;
+ my_merged_upsample_ptr upsample;
- upsample = (my_upsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- sizeof(my_upsampler));
- cinfo->upsample = (struct jpeg_upsampler *) upsample;
+ upsample = (my_merged_upsample_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_merged_upsampler));
+ cinfo->upsample = (struct jpeg_upsampler *)upsample;
upsample->pub.start_pass = start_pass_merged_upsample;
upsample->pub.need_context_rows = FALSE;
@@ -602,8 +563,8 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)
}
/* Allocate a spare row buffer */
upsample->spare_row = (JSAMPROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (size_t) (upsample->out_row_width * sizeof(JSAMPLE)));
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
} else {
upsample->pub.upsample = merged_1v_upsample;
if (jsimd_can_h2v1_merged_upsample())
diff --git a/media/libjpeg/jdmerge.h b/media/libjpeg/jdmerge.h
new file mode 100644
index 0000000000..b583396b10
--- /dev/null
+++ b/media/libjpeg/jdmerge.h
@@ -0,0 +1,47 @@
+/*
+ * jdmerge.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+
+
+/* Private subobject */
+
+typedef struct {
+ struct jpeg_upsampler pub; /* public fields */
+
+ /* Pointer to routine to do actual upsampling/conversion of one row group */
+ void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+ /* Private state for YCC->RGB conversion */
+ int *Cr_r_tab; /* => table for Cr to R conversion */
+ int *Cb_b_tab; /* => table for Cb to B conversion */
+ JLONG *Cr_g_tab; /* => table for Cr to G conversion */
+ JLONG *Cb_g_tab; /* => table for Cb to G conversion */
+
+ /* For 2:1 vertical sampling, we produce two output rows at a time.
+ * We need a "spare" row buffer to hold the second output row if the
+ * application provides just a one-row buffer; we also use the spare
+ * to discard the dummy last row if the image height is odd.
+ */
+ JSAMPROW spare_row;
+ boolean spare_full; /* T if spare buffer is occupied */
+
+ JDIMENSION out_row_width; /* samples per output row */
+ JDIMENSION rows_to_go; /* counts rows remaining in image */
+} my_merged_upsampler;
+
+typedef my_merged_upsampler *my_merged_upsample_ptr;
+
+#endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/media/libjpeg/jdmrg565.c b/media/libjpeg/jdmrg565.c
index 18287b3735..980a4e216e 100644
--- a/media/libjpeg/jdmrg565.c
+++ b/media/libjpeg/jdmrg565.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,23 +15,22 @@
INLINE
LOCAL(void)
-h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
@@ -44,20 +43,20 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -69,40 +68,40 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
- }
- }
+ *(INT16 *)outptr = (INT16)rgb;
+ }
+}
INLINE
LOCAL(void)
-h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
unsigned int r, g, b;
JLONG rgb;
@@ -116,21 +115,21 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -143,40 +142,39 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
INLINE
LOCAL(void)
-h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
@@ -191,20 +189,20 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -213,13 +211,13 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
WRITE_TWO_PIXELS(outptr0, rgb);
outptr0 += 4;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -231,56 +229,56 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr0 = (INT16)rgb;
+ *(INT16 *)outptr0 = (INT16)rgb;
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr1 = (INT16)rgb;
+ *(INT16 *)outptr1 = (INT16)rgb;
}
}
INLINE
LOCAL(void)
-h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
- JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+ JLONG d1 = dither_matrix[(cinfo->output_scanline + 1) & DITHER_MASK];
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
- inptr00 = input_buf[0][in_row_group_ctr*2];
- inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+ inptr00 = input_buf[0][in_row_group_ctr * 2];
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
@@ -289,38 +287,38 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr00++);
- r = range_limit[DITHER_565_R(y + cred, d1)];
- g = range_limit[DITHER_565_G(y + cgreen, d1)];
- b = range_limit[DITHER_565_B(y + cblue, d1)];
- d1 = DITHER_ROTATE(d1);
+ y = *inptr00++;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0);
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
WRITE_TWO_PIXELS(outptr0, rgb);
outptr0 += 4;
- y = GETJSAMPLE(*inptr01++);
- r = range_limit[DITHER_565_R(y + cred, d0)];
- g = range_limit[DITHER_565_G(y + cgreen, d0)];
- b = range_limit[DITHER_565_B(y + cblue, d0)];
- d0 = DITHER_ROTATE(d0);
+ y = *inptr01++;
+ r = range_limit[DITHER_565_R(y + cred, d1)];
+ g = range_limit[DITHER_565_G(y + cgreen, d1)];
+ b = range_limit[DITHER_565_B(y + cblue, d1)];
+ d1 = DITHER_ROTATE(d1);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[DITHER_565_R(y + cred, d1)];
g = range_limit[DITHER_565_G(y + cgreen, d1)];
b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -333,24 +331,24 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr0 = (INT16)rgb;
+ *(INT16 *)outptr0 = (INT16)rgb;
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
r = range_limit[DITHER_565_R(y + cred, d1)];
g = range_limit[DITHER_565_G(y + cgreen, d1)];
b = range_limit[DITHER_565_B(y + cblue, d1)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr1 = (INT16)rgb;
+ *(INT16 *)outptr1 = (INT16)rgb;
}
}
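
Beyond the pointer-cast and whitespace restyling, the substantive changes in jdmrg565.c are the switch to the dedicated my_merged_upsample_ptr type and, in h2v2_merged_upsample_565D_internal, applying dither row d0 to both pixels of the top output row and d1 throughout the bottom row, rather than alternating the two dither rows within a single scanline. A minimal sketch of the 5:6:5 packing these routines perform, with illustrative names (the real PACK_SHORT_565/PACK_TWO_PIXELS macros in jdmrg565.c are endian-aware):

#include <stdint.h>

/* Pack 8-bit R/G/B into one 16-bit 5:6:5 pixel (keep the top 5/6/5 bits). */
static inline uint16_t pack_565(unsigned r, unsigned g, unsigned b)
{
  return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}

/* Pair two 565 pixels into a 32-bit word for one aligned store
 * (little-endian order assumed in this sketch). */
static inline uint32_t pack_two(uint16_t px0, uint16_t px1)
{
  return (uint32_t)px0 | ((uint32_t)px1 << 16);
}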
diff --git a/media/libjpeg/jdmrgext.c b/media/libjpeg/jdmrgext.c
index 9d7d2af2e9..9bf4f1a307 100644
--- a/media/libjpeg/jdmrgext.c
+++ b/media/libjpeg/jdmrgext.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -21,23 +21,22 @@
INLINE
LOCAL(void)
-h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
SHIFT_TEMPS
inptr0 = input_buf[0][in_row_group_ctr];
@@ -47,13 +46,13 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -61,7 +60,7 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
outptr[RGB_ALPHA] = 0xFF;
#endif
outptr += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -72,12 +71,12 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
}
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -94,27 +93,26 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
SHIFT_TEMPS
- inptr00 = input_buf[0][in_row_group_ctr*2];
- inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+ inptr00 = input_buf[0][in_row_group_ctr * 2];
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
@@ -122,13 +120,13 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
@@ -136,7 +134,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr0[RGB_ALPHA] = 0xFF;
#endif
outptr0 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
@@ -144,7 +142,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr0[RGB_ALPHA] = 0xFF;
#endif
outptr0 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
@@ -152,7 +150,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr1[RGB_ALPHA] = 0xFF;
#endif
outptr1 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
@@ -163,19 +161,19 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
}
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
#ifdef RGB_ALPHA
outptr0[RGB_ALPHA] = 0xFF;
#endif
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
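
The GETJSAMPLE() wrappers dropped throughout this file were identity operations: libjpeg-turbo defines JSAMPLE as an unsigned 8-bit type, so the masking macro did nothing and was removed upstream. The Crrtab/Cbbtab/Crgtab/Cbgtab lookups implement the JFIF YCbCr-to-RGB conversion in fixed point; a sketch of how such tables are built, modeled on build_ycc_rgb_table() in jdmerge.c (constants are the standard conversion factors, names abbreviated):

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

/* Precompute per-chroma-value terms so the inner loop is pure lookups:
 * R = Y + Cr_r[cr], B = Y + Cb_b[cb],
 * G = Y + ((Cb_g[cb] + Cr_g[cr]) >> SCALEBITS). */
void build_ycc_tables(int Cr_r[256], int Cb_b[256], long Cb_g[256],
                      long Cr_g[256])
{
  for (int i = 0; i < 256; i++) {
    int x = i - 128;                          /* center the chroma value */
    Cr_r[i] = (int)((FIX(1.40200) * x + ONE_HALF) >> SCALEBITS);
    Cb_b[i] = (int)((FIX(1.77200) * x + ONE_HALF) >> SCALEBITS);
    Cb_g[i] = -FIX(0.34414) * x + ONE_HALF;   /* rounding folded in once */
    Cr_g[i] = -FIX(0.71414) * x;
  }
}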
diff --git a/media/libjpeg/jdphuff.c b/media/libjpeg/jdphuff.c
index c927ffa071..9680ebcbd0 100644
--- a/media/libjpeg/jdphuff.c
+++ b/media/libjpeg/jdphuff.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,12 +15,16 @@
* up to the start of the current MCU. To do this, we copy state variables
* into local working storage, and update them back to the permanent
* storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdhuff.h" /* Declarations shared with jdhuff.c */
+#include <limits.h>
#ifdef D_PROGRESSIVE_SUPPORTED
@@ -37,25 +41,6 @@ typedef struct {
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).EOBRUN = (src).EOBRUN, \
- (dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
typedef struct {
struct jpeg_entropy_decoder pub; /* public fields */
@@ -77,14 +62,14 @@ typedef struct {
typedef phuff_entropy_decoder *phuff_entropy_ptr;
/* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_DC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine(j_decompress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_AC_refine(j_decompress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
- JBLOCKROW *MCU_data);
/*
@@ -92,13 +77,13 @@ METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
+start_pass_phuff_decoder(j_decompress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band, bad;
int ci, coefi, tbl;
d_derived_tbl **pdtbl;
- int *coef_bit_ptr;
+ int *coef_bit_ptr, *prev_coef_bit_ptr;
jpeg_component_info *compptr;
is_DC_band = (cinfo->Ss == 0);
@@ -118,7 +103,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
}
if (cinfo->Ah != 0) {
/* Successive approximation refinement scan: must have Al = Ah-1. */
- if (cinfo->Al != cinfo->Ah-1)
+ if (cinfo->Al != cinfo->Ah - 1)
bad = TRUE;
}
if (cinfo->Al > 13) /* need not check for < 0 */
@@ -138,9 +123,16 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
*/
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
int cindex = cinfo->cur_comp_info[ci]->component_index;
- coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+ coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
if (cinfo->Ah != expected)
@@ -205,22 +197,26 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
#define AVOID_TABLES
#ifdef AVOID_TABLES
-#define NEG_1 ((unsigned)-1)
-#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x))
+#define NEG_1 ((unsigned)-1)
+#define HUFF_EXTEND(x, s) \
+ ((x) < (1 << ((s) - 1)) ? (x) + (((NEG_1) << (s)) + 1) : (x))
#else
-#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-static const int extend_test[16] = /* entry n is 2**(n-1) */
- { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
- 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
- { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
- ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
- ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
- ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
#endif /* AVOID_TABLES */
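
HUFF_EXTEND is the EXTEND procedure of spec Figure F.12: an s-bit magnitude code whose value falls below 2^(s-1) encodes a negative coefficient. A plain-function sketch equivalent to the reformatted macro above:

/* Sign-extend an s-bit Huffman magnitude code: received values
 * 0 .. 2^(s-1)-1 map to -(2^s - 1) .. -2^(s-1); the rest pass through. */
static int huff_extend(int x, int s)
{
  return (x < (1 << (s - 1))) ? x - ((1 << s) - 1) : x;
}
/* e.g. s = 3: codes 0..3 -> -7..-4, codes 4..7 -> 4..7 */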
@@ -231,9 +227,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
*/
LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int ci;
/* Throw away any unused bits remaining in bit buffer; */
@@ -242,7 +238,7 @@ process_restart (j_decompress_ptr cinfo)
entropy->bitstate.bits_left = 0;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
return FALSE;
/* Re-initialize DC predictions to 0 */
@@ -289,9 +285,9 @@ process_restart (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Al = cinfo->Al;
register int s, r;
int blkn, ci;
@@ -304,18 +300,18 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved;
/* Outer loop handles each block in the MCU */
@@ -336,19 +332,24 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Convert DC difference to actual value, update last_dc_val */
+ if ((state.last_dc_val[ci] >= 0 &&
+ s > INT_MAX - state.last_dc_val[ci]) ||
+ (state.last_dc_val[ci] < 0 && s < INT_MIN - state.last_dc_val[ci]))
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
/* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
- (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al);
+ (*block)[0] = (JCOEF)LEFT_SHIFT(s, Al);
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
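
Two hardening fixes land in this hunk. First, the DC difference is now range-checked before accumulation, since signed-integer overflow is undefined behavior in C and a crafted progressive stream could otherwise trigger it; the check mirrors the ERREXIT(JERR_BAD_DCT_COEF) path above. Second, restarts_to_go is only decremented when a restart interval is in effect, because the counter is unsigned and is never reloaded otherwise, so an unconditional decrement would wrap. A sketch of the overflow guard as a standalone helper (name illustrative):

#include <limits.h>

/* Checked accumulation of the DC difference; *bad signals the caller to
 * raise the bad-coefficient error instead of overflowing. */
static int dc_accumulate(int last_dc, int diff, int *bad)
{
  *bad = (last_dc >= 0) ? (diff > INT_MAX - last_dc)
                        : (diff < INT_MIN - last_dc);
  return *bad ? last_dc : last_dc + diff;
}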
@@ -360,9 +361,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Se = cinfo->Se;
int Al = cinfo->Al;
register int s, k, r;
@@ -374,14 +375,14 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state.
* We can avoid loading/saving bitread state if in an EOB run.
@@ -393,7 +394,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (EOBRUN > 0) /* if it's a band of zeroes... */
EOBRUN--; /* ...process it now (we do nothing) */
else {
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
block = MCU_data[0];
tbl = entropy->ac_derived_tbl;
@@ -407,7 +408,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
/* Scale and output coefficient in natural (dezigzagged) order */
- (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al);
+ (*block)[jpeg_natural_order[k]] = (JCOEF)LEFT_SHIFT(s, Al);
} else {
if (r == 15) { /* ZRL */
k += 15; /* skip 15 zeroes in band */
@@ -424,7 +425,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
}
/* Completed MCU, so update state */
@@ -432,7 +433,8 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -445,9 +447,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
int blkn;
JBLOCKROW block;
@@ -456,7 +458,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
@@ -465,7 +467,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
/* Outer loop handles each block in the MCU */
@@ -480,10 +482,11 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -494,9 +497,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Se = cinfo->Se;
int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
int m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
@@ -512,16 +515,16 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, don't modify the MCU.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
/* There is always only one block per MCU */
@@ -575,9 +578,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
if (*thiscoef >= 0)
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
else
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
}
}
} else {
@@ -589,7 +592,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (s) {
int pos = jpeg_natural_order[k];
/* Output newly nonzero coefficient */
- (*block)[pos] = (JCOEF) s;
+ (*block)[pos] = (JCOEF)s;
/* Remember its position in case we have to suspend */
newnz_pos[num_newnz++] = pos;
}
@@ -609,9 +612,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
if (*thiscoef >= 0)
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
else
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
}
}
}
@@ -621,12 +624,13 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
@@ -644,16 +648,16 @@ undoit:
*/
GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
+jinit_phuff_decoder(j_decompress_ptr cinfo)
{
phuff_entropy_ptr entropy;
int *coef_bit_ptr;
int ci, i;
entropy = (phuff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(phuff_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass_phuff_decoder;
/* Mark derived tables unallocated */
@@ -663,9 +667,10 @@ jinit_phuff_decoder (j_decompress_ptr cinfo)
/* Create progression status table */
cinfo->coef_bits = (int (*)[DCTSIZE2])
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components*DCTSIZE2*sizeof(int));
- coef_bit_ptr = & cinfo->coef_bits[0][0];
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 * DCTSIZE2 *
+ sizeof(int));
+ coef_bit_ptr = &cinfo->coef_bits[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (i = 0; i < DCTSIZE2; i++)
*coef_bit_ptr++ = -1;
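
Two structural points in the jdphuff.c changes are worth noting. The ASSIGN_STATE macro is gone; plain struct assignment (state = entropy->saved) is now assumed to work, dropping the workaround for compilers with broken structure assignment. And the coef_bits allocation doubles: the first num_components rows track the current Al per coefficient, while the second num_components rows snapshot the previous scan's values, which is what prev_coef_bit_ptr in start_pass_phuff_decoder indexes to validate successive-approximation progressions. The layout, in a sketch:

/* coef_bits after the change: 2 * num_components rows of DCTSIZE2 ints.
 * Row ci = current Al per coefficient of component ci;
 * row ci + num_components = the value recorded in the previous scan. */
int *cur  = &cinfo->coef_bits[cindex][0];
int *prev = &cinfo->coef_bits[cindex + cinfo->num_components][0];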
diff --git a/media/libjpeg/jdpostct.c b/media/libjpeg/jdpostct.c
index 601fc2a792..6a2cf5c1b3 100644
--- a/media/libjpeg/jdpostct.c
+++ b/media/libjpeg/jdpostct.c
@@ -46,22 +46,28 @@ typedef my_post_controller *my_post_ptr;
/* Forward declarations */
-METHODDEF(void) post_process_1pass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) post_process_prepass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
-METHODDEF(void) post_process_2pass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#endif
@@ -70,9 +76,9 @@ METHODDEF(void) post_process_2pass
*/
METHODDEF(void)
-start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
switch (pass_mode) {
case JBUF_PASS_THRU:
@@ -85,8 +91,8 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
if (post->buffer == NULL) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
- (JDIMENSION) 0, post->strip_height, TRUE);
+ ((j_common_ptr)cinfo, post->whole_image,
+ (JDIMENSION)0, post->strip_height, TRUE);
}
} else {
/* For single-pass processing without color quantization,
@@ -123,13 +129,12 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-post_process_1pass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION num_rows, max_rows;
/* Fill the buffer, but not more than what we can dump out in one go. */
@@ -138,12 +143,13 @@ post_process_1pass (j_decompress_ptr cinfo,
if (max_rows > post->strip_height)
max_rows = post->strip_height;
num_rows = 0;
- (*cinfo->upsample->upsample) (cinfo,
- input_buf, in_row_group_ctr, in_row_groups_avail,
- post->buffer, &num_rows, max_rows);
+ (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+ in_row_groups_avail, post->buffer, &num_rows,
+ max_rows);
/* Quantize and emit data. */
- (*cinfo->cquantize->color_quantize) (cinfo,
- post->buffer, output_buf + *out_row_ctr, (int) num_rows);
+ (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
+ output_buf + *out_row_ctr,
+ (int)num_rows);
*out_row_ctr += num_rows;
}
@@ -155,34 +161,33 @@ post_process_1pass (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-post_process_prepass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION old_next_row, num_rows;
/* Reposition virtual buffer if at start of strip. */
if (post->next_row == 0) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
+ ((j_common_ptr)cinfo, post->whole_image,
post->starting_row, post->strip_height, TRUE);
}
/* Upsample some data (up to a strip height's worth). */
old_next_row = post->next_row;
- (*cinfo->upsample->upsample) (cinfo,
- input_buf, in_row_group_ctr, in_row_groups_avail,
- post->buffer, &post->next_row, post->strip_height);
+ (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+ in_row_groups_avail, post->buffer,
+ &post->next_row, post->strip_height);
/* Allow quantizer to scan new data. No data is emitted, */
/* but we advance out_row_ctr so outer loop can tell when we're done. */
if (post->next_row > old_next_row) {
num_rows = post->next_row - old_next_row;
(*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
- (JSAMPARRAY) NULL, (int) num_rows);
+ (JSAMPARRAY)NULL, (int)num_rows);
*out_row_ctr += num_rows;
}
@@ -199,19 +204,18 @@ post_process_prepass (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-post_process_2pass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION num_rows, max_rows;
/* Reposition virtual buffer if at start of strip. */
if (post->next_row == 0) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
+ ((j_common_ptr)cinfo, post->whole_image,
post->starting_row, post->strip_height, FALSE);
}
@@ -226,9 +230,9 @@ post_process_2pass (j_decompress_ptr cinfo,
num_rows = max_rows;
/* Quantize and emit data. */
- (*cinfo->cquantize->color_quantize) (cinfo,
- post->buffer + post->next_row, output_buf + *out_row_ctr,
- (int) num_rows);
+ (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
+ output_buf + *out_row_ctr,
+ (int)num_rows);
*out_row_ctr += num_rows;
/* Advance if we filled the strip. */
@@ -247,14 +251,14 @@ post_process_2pass (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_post_ptr post;
post = (my_post_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_post_controller));
- cinfo->post = (struct jpeg_d_post_controller *) post;
+ cinfo->post = (struct jpeg_d_post_controller *)post;
post->pub.start_pass = start_pass_dpost;
post->whole_image = NULL; /* flag for no virtual arrays */
post->buffer = NULL; /* flag for no strip buffer */
@@ -265,16 +269,16 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
* an efficient number of rows for upsampling to return.
* (In the presence of output rescaling, we might want to be smarter?)
*/
- post->strip_height = (JDIMENSION) cinfo->max_v_samp_factor;
+ post->strip_height = (JDIMENSION)cinfo->max_v_samp_factor;
if (need_full_buffer) {
/* Two-pass color quantization: need full-image storage. */
/* We round up the number of rows to a multiple of the strip height. */
#ifdef QUANT_2PASS_SUPPORTED
post->whole_image = (*cinfo->mem->request_virt_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
cinfo->output_width * cinfo->out_color_components,
- (JDIMENSION) jround_up((long) cinfo->output_height,
- (long) post->strip_height),
+ (JDIMENSION)jround_up((long)cinfo->output_height,
+ (long)post->strip_height),
post->strip_height);
#else
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -282,7 +286,7 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
} else {
/* One-pass color quantization: just make a strip buffer. */
post->buffer = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->output_width * cinfo->out_color_components,
post->strip_height);
}
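
The jdpostct.c changes are purely stylistic (cast spacing and argument wrapping); behavior is unchanged. For two-pass quantization the virtual array is still sized by rounding the image height up to a whole number of strips. A sketch of jround_up() as used above (the real function lives in jutils.c):

/* Smallest multiple of b that is >= a. */
static long round_up(long a, long b)
{
  a += b - 1L;
  return a - (a % b);
}
/* e.g. output_height 601, strip_height 8 -> 608 buffered rows */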
diff --git a/media/libjpeg/jdsample.c b/media/libjpeg/jdsample.c
index b1378e1512..eaad72a030 100644
--- a/media/libjpeg/jdsample.c
+++ b/media/libjpeg/jdsample.c
@@ -8,6 +8,7 @@
* Copyright (C) 2010, 2015-2016, D. R. Commander.
* Copyright (C) 2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019-2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -36,9 +37,9 @@
*/
METHODDEF(void)
-start_pass_upsample (j_decompress_ptr cinfo)
+start_pass_upsample(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
/* Mark the conversion buffer empty */
upsample->next_row_out = cinfo->max_v_samp_factor;
@@ -56,13 +57,12 @@ start_pass_upsample (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-sep_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
int ci;
jpeg_component_info *compptr;
JDIMENSION num_rows;
@@ -84,7 +84,7 @@ sep_upsample (j_decompress_ptr cinfo,
/* Color-convert and emit rows */
/* How many we have in the buffer: */
- num_rows = (JDIMENSION) (cinfo->max_v_samp_factor - upsample->next_row_out);
+ num_rows = (JDIMENSION)(cinfo->max_v_samp_factor - upsample->next_row_out);
/* Not more than the distance to the end of the image. Need this test
* in case the image height is not a multiple of max_v_samp_factor:
*/
@@ -96,9 +96,8 @@ sep_upsample (j_decompress_ptr cinfo,
num_rows = out_rows_avail;
(*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
- (JDIMENSION) upsample->next_row_out,
- output_buf + *out_row_ctr,
- (int) num_rows);
+ (JDIMENSION)upsample->next_row_out,
+ output_buf + *out_row_ctr, (int)num_rows);
/* Adjust counts */
*out_row_ctr += num_rows;
@@ -124,8 +123,8 @@ sep_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
*output_data_ptr = input_data;
}
@@ -137,8 +136,8 @@ fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
*output_data_ptr = NULL; /* safety check */
}
@@ -156,10 +155,10 @@ noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
register JSAMPLE invalue;
@@ -178,15 +177,15 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[outrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
for (h = h_expand; h > 0; h--) {
*outptr++ = invalue;
}
}
/* Generate any additional output rows by duplicating the first one */
if (v_expand > 1) {
- jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
- v_expand-1, cinfo->output_width);
+ jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+ v_expand - 1, cinfo->output_width);
}
inrow++;
outrow += v_expand;
@@ -200,8 +199,8 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -214,7 +213,7 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[inrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
*outptr++ = invalue;
*outptr++ = invalue;
}
@@ -228,8 +227,8 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -243,12 +242,12 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[outrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
*outptr++ = invalue;
*outptr++ = invalue;
}
- jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
- 1, cinfo->output_width);
+ jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+ cinfo->output_width);
inrow++;
outrow += 2;
}
@@ -271,8 +270,8 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -284,21 +283,21 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
inptr = input_data[inrow];
outptr = output_data[inrow];
/* Special case for first column */
- invalue = GETJSAMPLE(*inptr++);
- *outptr++ = (JSAMPLE) invalue;
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+ invalue = *inptr++;
+ *outptr++ = (JSAMPLE)invalue;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
/* General case: 3/4 * nearer pixel + 1/4 * further pixel */
- invalue = GETJSAMPLE(*inptr++) * 3;
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+ invalue = (*inptr++) * 3;
+ *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+ *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
}
/* Special case for last column */
- invalue = GETJSAMPLE(*inptr);
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
- *outptr++ = (JSAMPLE) invalue;
+ invalue = *inptr;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
+ *outptr++ = (JSAMPLE)invalue;
}
}
@@ -311,15 +310,15 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, outptr;
#if BITS_IN_JSAMPLE == 8
- int thiscolsum;
+ int thiscolsum, bias;
#else
- JLONG thiscolsum;
+ JLONG thiscolsum, bias;
#endif
JDIMENSION colctr;
int inrow, outrow, v;
@@ -329,15 +328,18 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
for (v = 0; v < 2; v++) {
/* inptr0 points to nearest input row, inptr1 points to next nearest */
inptr0 = input_data[inrow];
- if (v == 0) /* next nearest is row above */
- inptr1 = input_data[inrow-1];
- else /* next nearest is row below */
- inptr1 = input_data[inrow+1];
+ if (v == 0) { /* next nearest is row above */
+ inptr1 = input_data[inrow - 1];
+ bias = 1;
+ } else { /* next nearest is row below */
+ inptr1 = input_data[inrow + 1];
+ bias = 2;
+ }
outptr = output_data[outrow++];
- for(colctr = 0; colctr < compptr->downsampled_width; colctr++) {
- thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2);
+ for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
+ thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
}
}
inrow++;
@@ -354,8 +356,8 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr0, inptr1, outptr;
@@ -373,30 +375,30 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* inptr0 points to nearest input row, inptr1 points to next nearest */
inptr0 = input_data[inrow];
if (v == 0) /* next nearest is row above */
- inptr1 = input_data[inrow-1];
+ inptr1 = input_data[inrow - 1];
else /* next nearest is row below */
- inptr1 = input_data[inrow+1];
+ inptr1 = input_data[inrow + 1];
outptr = output_data[outrow++];
/* Special case for first column */
- thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+ thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
/* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
/* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
}
/* Special case for last column */
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
}
inrow++;
}
@@ -408,7 +410,7 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_upsampler (j_decompress_ptr cinfo)
+jinit_upsampler(j_decompress_ptr cinfo)
{
my_upsample_ptr upsample;
int ci;
@@ -418,14 +420,14 @@ jinit_upsampler (j_decompress_ptr cinfo)
if (!cinfo->master->jinit_upsampler_no_alloc) {
upsample = (my_upsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_upsampler));
- cinfo->upsample = (struct jpeg_upsampler *) upsample;
+ cinfo->upsample = (struct jpeg_upsampler *)upsample;
upsample->pub.start_pass = start_pass_upsample;
upsample->pub.upsample = sep_upsample;
upsample->pub.need_context_rows = FALSE; /* until we find out differently */
} else
- upsample = (my_upsample_ptr) cinfo->upsample;
+ upsample = (my_upsample_ptr)cinfo->upsample;
if (cinfo->CCIR601_sampling) /* this isn't supported */
ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
@@ -451,7 +453,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
v_out_group = cinfo->max_v_samp_factor;
upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
need_buffer = TRUE;
- if (! compptr->component_needed) {
+ if (!compptr->component_needed) {
/* Don't bother to upsample an uninteresting component. */
upsample->methods[ci] = noop_upsample;
need_buffer = FALSE;
@@ -459,8 +461,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
/* Fullsize components can be processed without any work. */
upsample->methods[ci] = fullsize_upsample;
need_buffer = FALSE;
- } else if (h_in_group * 2 == h_out_group &&
- v_in_group == v_out_group) {
+ } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
/* Special cases for 2h1v upsampling */
if (do_fancy && compptr->downsampled_width > 2) {
if (jsimd_can_h2v1_fancy_upsample())
@@ -476,7 +477,13 @@ jinit_upsampler (j_decompress_ptr cinfo)
} else if (h_in_group == h_out_group &&
v_in_group * 2 == v_out_group && do_fancy) {
/* Non-fancy upsampling is handled by the generic method */
- upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+ if (jsimd_can_h1v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+ else
+#endif
+ upsample->methods[ci] = h1v2_fancy_upsample;
upsample->pub.need_context_rows = TRUE;
} else if (h_in_group * 2 == h_out_group &&
v_in_group * 2 == v_out_group) {
@@ -502,16 +509,16 @@ jinit_upsampler (j_decompress_ptr cinfo)
else
#endif
upsample->methods[ci] = int_upsample;
- upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group);
- upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
+ upsample->h_expand[ci] = (UINT8)(h_out_group / h_in_group);
+ upsample->v_expand[ci] = (UINT8)(v_out_group / v_in_group);
} else
ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) jround_up((long) cinfo->output_width,
- (long) cinfo->max_h_samp_factor),
- (JDIMENSION) cinfo->max_v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)jround_up((long)cinfo->output_width,
+ (long)cinfo->max_h_samp_factor),
+ (JDIMENSION)cinfo->max_v_samp_factor);
}
}
}
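
Besides the Arm Neon hook for h1v2 fancy upsampling, the behavioral fix in jdsample.c is the rounding bias in h1v2_fancy_upsample: the output row interpolated toward the row above now adds 1 and the row interpolated toward the row below adds 2 before the >> 2, so the two phases round in complementary directions instead of both rounding the same way. The underlying blend, sketched:

/* "Fancy" 1:2 vertical upsample: each output row is a 3:1 blend of the
 * two nearest input rows; complementary biases keep the pair centered. */
static unsigned char blend_v2(unsigned char near_px, unsigned char far_px,
                              int bias)          /* bias is 1 or 2 */
{
  return (unsigned char)((near_px * 3 + far_px + bias) >> 2);
}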
diff --git a/media/libjpeg/jdtrans.c b/media/libjpeg/jdtrans.c
index cfc85dd24c..d7ec4b83b3 100644
--- a/media/libjpeg/jdtrans.c
+++ b/media/libjpeg/jdtrans.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -16,10 +16,11 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/* Forward declarations */
-LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
+LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
/*
@@ -45,7 +46,7 @@ LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
*/
GLOBAL(jvirt_barray_ptr *)
-jpeg_read_coefficients (j_decompress_ptr cinfo)
+jpeg_read_coefficients(j_decompress_ptr cinfo)
{
if (cinfo->global_state == DSTATE_READY) {
/* First call: initialize active modules */
@@ -58,7 +59,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
int retcode;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL)
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
/* Absorb some more input */
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_SUSPENDED)
@@ -70,7 +71,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
(retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
/* startup underestimated number of scans; ratchet up one scan */
- cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
}
}
}
@@ -97,7 +98,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
*/
LOCAL(void)
-transdecode_master_selection (j_decompress_ptr cinfo)
+transdecode_master_selection(j_decompress_ptr cinfo)
{
/* This is effectively a buffered-image operation. */
cinfo->buffered_image = TRUE;
@@ -129,7 +130,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
jinit_d_coef_controller(cinfo, TRUE);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Initialize input side of decompressor to consume first scan. */
(*cinfo->inputctl->start_input_pass) (cinfo);
@@ -148,7 +149,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
nscans = 1;
}
cinfo->progress->pass_counter = 0L;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
cinfo->progress->completed_passes = 0;
cinfo->progress->total_passes = 1;
}
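
jpeg_read_coefficients(), reformatted above, is the entry point for DCT-domain transcoding: the caller decodes coefficient arrays and re-emits them without a lossy decompress/recompress round trip. A minimal sketch of that flow, with error handling and option setup elided, follows; all calls are the standard libjpeg API.

#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

/* Lossless DCT-domain transcode: read coefficients, write them back out. */
int transcode(FILE *in, FILE *out)
{
  struct jpeg_decompress_struct srcinfo;
  struct jpeg_compress_struct dstinfo;
  struct jpeg_error_mgr jsrcerr, jdsterr;
  jvirt_barray_ptr *coef_arrays;

  srcinfo.err = jpeg_std_error(&jsrcerr);
  jpeg_create_decompress(&srcinfo);
  dstinfo.err = jpeg_std_error(&jdsterr);
  jpeg_create_compress(&dstinfo);

  jpeg_stdio_src(&srcinfo, in);
  (void)jpeg_read_header(&srcinfo, TRUE);
  coef_arrays = jpeg_read_coefficients(&srcinfo);   /* the routine above */

  jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
  jpeg_stdio_dest(&dstinfo, out);
  jpeg_write_coefficients(&dstinfo, coef_arrays);   /* same arrays, no IDCT */

  jpeg_finish_compress(&dstinfo);
  jpeg_destroy_compress(&dstinfo);
  (void)jpeg_finish_decompress(&srcinfo);
  jpeg_destroy_decompress(&srcinfo);
  return 0;
}

This is the same shape jpegtran uses; the coefficient arrays stay in the decompressor's memory pool until jpeg_finish_decompress().
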
diff --git a/media/libjpeg/jerror.c b/media/libjpeg/jerror.c
index c31acd9ef0..d544702937 100644
--- a/media/libjpeg/jerror.c
+++ b/media/libjpeg/jerror.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -44,7 +44,7 @@
* want to refer to it directly.
*/
-#define JMESSAGE(code,string) string ,
+#define JMESSAGE(code, string) string,
const char * const jpeg_std_message_table[] = {
#include "jerror.h"
@@ -66,7 +66,7 @@ const char * const jpeg_std_message_table[] = {
*/
METHODDEF(void)
-error_exit (j_common_ptr cinfo)
+error_exit(j_common_ptr cinfo)
{
/* Always display the message */
(*cinfo->err->output_message) (cinfo);
@@ -94,7 +94,7 @@ error_exit (j_common_ptr cinfo)
*/
METHODDEF(void)
-output_message (j_common_ptr cinfo)
+output_message(j_common_ptr cinfo)
{
char buffer[JMSG_LENGTH_MAX];
@@ -124,7 +124,7 @@ output_message (j_common_ptr cinfo)
*/
METHODDEF(void)
-emit_message (j_common_ptr cinfo, int msg_level)
+emit_message(j_common_ptr cinfo, int msg_level)
{
struct jpeg_error_mgr *err = cinfo->err;
@@ -153,7 +153,7 @@ emit_message (j_common_ptr cinfo, int msg_level)
*/
METHODDEF(void)
-format_message (j_common_ptr cinfo, char *buffer)
+format_message(j_common_ptr cinfo, char *buffer)
{
struct jpeg_error_mgr *err = cinfo->err;
int msg_code = err->msg_code;
@@ -189,13 +189,13 @@ format_message (j_common_ptr cinfo, char *buffer)
/* Format the message into the passed buffer */
if (isstring)
- sprintf(buffer, msgtext, err->msg_parm.s);
+ snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
else
- sprintf(buffer, msgtext,
- err->msg_parm.i[0], err->msg_parm.i[1],
- err->msg_parm.i[2], err->msg_parm.i[3],
- err->msg_parm.i[4], err->msg_parm.i[5],
- err->msg_parm.i[6], err->msg_parm.i[7]);
+ snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
+ err->msg_parm.i[0], err->msg_parm.i[1],
+ err->msg_parm.i[2], err->msg_parm.i[3],
+ err->msg_parm.i[4], err->msg_parm.i[5],
+ err->msg_parm.i[6], err->msg_parm.i[7]);
}
@@ -208,7 +208,7 @@ format_message (j_common_ptr cinfo, char *buffer)
*/
METHODDEF(void)
-reset_error_mgr (j_common_ptr cinfo)
+reset_error_mgr(j_common_ptr cinfo)
{
cinfo->err->num_warnings = 0;
/* trace_level is not reset since it is an application-supplied parameter */
@@ -227,7 +227,7 @@ reset_error_mgr (j_common_ptr cinfo)
*/
GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr *err)
+jpeg_std_error(struct jpeg_error_mgr *err)
{
err->error_exit = error_exit;
err->emit_message = emit_message;
@@ -241,7 +241,7 @@ jpeg_std_error (struct jpeg_error_mgr *err)
/* Initialize message table pointers */
err->jpeg_message_table = jpeg_std_message_table;
- err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1;
+ err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1;
err->addon_message_table = NULL;
err->first_addon_message = 0; /* for safety */
diff --git a/media/libjpeg/jerror.h b/media/libjpeg/jerror.h
index 11a07cb5d0..eb44a1140a 100644
--- a/media/libjpeg/jerror.h
+++ b/media/libjpeg/jerror.h
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -28,7 +28,7 @@
#define JMAKE_ENUM_LIST
#else
/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
-#define JMESSAGE(code,string)
+#define JMESSAGE(code, string)
#endif /* JERROR_H */
#endif /* JMESSAGE */
@@ -36,7 +36,7 @@
typedef enum {
-#define JMESSAGE(code,string) code ,
+#define JMESSAGE(code, string) code,
#endif /* JMAKE_ENUM_LIST */
@@ -44,8 +44,7 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
/* For maintenance convenience, list is alphabetical by message code name */
#if JPEG_LIB_VERSION < 70
-JMESSAGE(JERR_ARITH_NOTIMPL,
- "Sorry, arithmetic coding is not implemented")
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
#endif
JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
@@ -104,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
"Cannot transcode due to multiple use of quantization table %d")
JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
-JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
#if JPEG_LIB_VERSION >= 70
JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
@@ -154,8 +153,7 @@ JMESSAGE(JTRC_HUFFBITS, " %3d %3d %3d %3d %3d %3d %3d %3d")
JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d %d")
JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
"Warning: thumbnail image size does not match data length %u")
-JMESSAGE(JTRC_JFIF_EXTENSION,
- "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
JMESSAGE(JTRC_JFIF_THUMBNAIL, " with %d x %d thumbnail image")
JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
@@ -208,6 +206,11 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
#endif
#endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+ "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
#ifdef JMAKE_ENUM_LIST
@@ -228,90 +231,101 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
/* The first parameter is either type of cinfo pointer */
/* Fatal errors (print message and exit) */
-#define ERREXIT(cinfo,code) \
+#define ERREXIT(cinfo, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT1(cinfo,code,p1) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT2(cinfo,code,p1,p2) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT3(cinfo,code,p1,p2,p3) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
(cinfo)->err->msg_parm.i[2] = (p3), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT4(cinfo,code,p1,p2,p3,p4) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (cinfo)->err->msg_parm.i[3] = (p4), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
(cinfo)->err->msg_parm.i[2] = (p3), \
(cinfo)->err->msg_parm.i[3] = (p4), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXITS(cinfo,code,str) \
+ (cinfo)->err->msg_parm.i[4] = (p5), \
+ (cinfo)->err->msg_parm.i[5] = (p6), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
#define MAKESTMT(stuff) do { stuff } while (0)
/* Nonfatal errors (we can keep going, but the data is probably corrupt) */
-#define WARNMS(cinfo,code) \
+#define WARNMS(cinfo, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS1(cinfo,code,p1) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS2(cinfo,code,p1,p2) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
/* Informational/debugging messages */
-#define TRACEMS(cinfo,lvl,code) \
+#define TRACEMS(cinfo, lvl, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS1(cinfo,lvl,code,p1) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS2(cinfo,lvl,code,p1,p2) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS3(cinfo,lvl,code,p1,p2,p3) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
_mp[4] = (p5); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
- _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMSS(cinfo,lvl,code,str) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
#endif /* JERROR_H */
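
Besides the whitespace churn, ERREXITS and TRACEMSS above gain one real fix: an explicit NUL terminator after strncpy(). strncpy() leaves the destination unterminated whenever the source fills it, so a long string parameter could previously make later reads of msg_parm.s run off the end. A short sketch, using the library's 80-byte string-parameter size:

#include <stdio.h>
#include <string.h>

#define JMSG_STR_PARM_MAX 80  /* size of the msg_parm.s field */

int main(void)
{
  char s[JMSG_STR_PARM_MAX];
  const char *longstr =
    "a source string comfortably longer than the eighty-byte destination, "
    "which strncpy will fill completely without appending a terminator";

  strncpy(s, longstr, JMSG_STR_PARM_MAX);
  s[JMSG_STR_PARM_MAX - 1] = '\0';   /* the termination the macros now force */
  printf("%s\n", s);                 /* safe: always NUL-terminated */
  return 0;
}
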
diff --git a/media/libjpeg/jfdctflt.c b/media/libjpeg/jfdctflt.c
index b3da3ebda8..ab6f6d0825 100644
--- a/media/libjpeg/jfdctflt.c
+++ b/media/libjpeg/jfdctflt.c
@@ -57,7 +57,7 @@
*/
GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT *data)
+jpeg_fdct_float(FAST_FLOAT *data)
{
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -68,7 +68,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -88,7 +88,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
dataptr[0] = tmp10 + tmp11; /* phase 3 */
dataptr[4] = tmp10 - tmp11;
- z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
dataptr[2] = tmp13 + z1; /* phase 5 */
dataptr[6] = tmp13 - z1;
@@ -99,10 +99,10 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp12 = tmp6 + tmp7;
/* The rotator is modified from fig 4-8 to avoid extra negations. */
- z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
- z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
- z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
- z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
@@ -118,15 +118,15 @@ jpeg_fdct_float (FAST_FLOAT *data)
/* Pass 2: process columns. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part */
@@ -135,12 +135,12 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
- dataptr[DCTSIZE*4] = tmp10 - tmp11;
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
- z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
- dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
- dataptr[DCTSIZE*6] = tmp13 - z1;
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
/* Odd part */
@@ -149,18 +149,18 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp12 = tmp6 + tmp7;
/* The rotator is modified from fig 4-8 to avoid extra negations. */
- z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
- z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
- z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
- z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
- dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
- dataptr[DCTSIZE*3] = z13 - z2;
- dataptr[DCTSIZE*1] = z11 + z4;
- dataptr[DCTSIZE*7] = z11 - z4;
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
dataptr++; /* advance pointer to next column */
}
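
The magic constants in the float FDCT above (0.707106781, 0.382683433, 0.541196100, 1.306562965) are cosines of k*pi/16, per the AAN factorization the comments cite. A quick sketch recomputing them, just to make the c2/c4/c6 annotations concrete:

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main(void)
{
  double c2 = cos(2 * M_PI / 16);
  double c4 = cos(4 * M_PI / 16);
  double c6 = cos(6 * M_PI / 16);

  printf("c4      = %.9f  (0.707106781)\n", c4);
  printf("c6      = %.9f  (0.382683433)\n", c6);
  printf("c2 - c6 = %.9f  (0.541196100)\n", c2 - c6);
  printf("c2 + c6 = %.9f  (1.306562965)\n", c2 + c6);
  return 0;
}
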
diff --git a/media/libjpeg/jfdctfst.c b/media/libjpeg/jfdctfst.c
index 5cd83a7b8e..4c9ce0de8f 100644
--- a/media/libjpeg/jfdctfst.c
+++ b/media/libjpeg/jfdctfst.c
@@ -79,10 +79,10 @@
*/
#if CONST_BITS == 8
-#define FIX_0_382683433 ((JLONG) 98) /* FIX(0.382683433) */
-#define FIX_0_541196100 ((JLONG) 139) /* FIX(0.541196100) */
-#define FIX_0_707106781 ((JLONG) 181) /* FIX(0.707106781) */
-#define FIX_1_306562965 ((JLONG) 334) /* FIX(1.306562965) */
+#define FIX_0_382683433 ((JLONG)98) /* FIX(0.382683433) */
+#define FIX_0_541196100 ((JLONG)139) /* FIX(0.541196100) */
+#define FIX_0_707106781 ((JLONG)181) /* FIX(0.707106781) */
+#define FIX_1_306562965 ((JLONG)334) /* FIX(1.306562965) */
#else
#define FIX_0_382683433 FIX(0.382683433)
#define FIX_0_541196100 FIX(0.541196100)
@@ -98,7 +98,7 @@
#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
-#define DESCALE(x,n) RIGHT_SHIFT(x, n)
+#define DESCALE(x, n) RIGHT_SHIFT(x, n)
#endif
@@ -106,7 +106,7 @@
* descale to yield a DCTELEM result.
*/
-#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
/*
@@ -114,7 +114,7 @@
*/
GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM *data)
+jpeg_fdct_ifast(DCTELEM *data)
{
DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -126,7 +126,7 @@ jpeg_fdct_ifast (DCTELEM *data)
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -176,15 +176,15 @@ jpeg_fdct_ifast (DCTELEM *data)
/* Pass 2: process columns. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part */
@@ -193,12 +193,12 @@ jpeg_fdct_ifast (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
- dataptr[DCTSIZE*4] = tmp10 - tmp11;
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
- dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
- dataptr[DCTSIZE*6] = tmp13 - z1;
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
/* Odd part */
@@ -215,10 +215,10 @@ jpeg_fdct_ifast (DCTELEM *data)
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
- dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
- dataptr[DCTSIZE*3] = z13 - z2;
- dataptr[DCTSIZE*1] = z11 + z4;
- dataptr[DCTSIZE*7] = z11 - z4;
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
dataptr++; /* advance pointer to next column */
}
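
The ifast FDCT's MULTIPLY above is fixed-point arithmetic at CONST_BITS == 8: constants are pre-scaled by 2^8 (so FIX(0.707106781) becomes 181), the product carries eight extra fraction bits, and DESCALE shifts them back out. A minimal sketch of that round trip, using the rounding form of DESCALE as an assumption:

#include <stdio.h>

#define CONST_BITS 8
#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))  /* round-to-nearest */
#define MULTIPLY(var, c) ((long)DESCALE((long)(var) * (c), CONST_BITS))

int main(void)
{
  /* FIX(0.707106781) at 8 fraction bits: round(0.707106781 * 256) == 181 */
  long FIX_0_707106781 = 181;

  /* 100 * 0.7071... is about 70.7; the fixed-point product descales to 71. */
  printf("%ld\n", MULTIPLY(100, FIX_0_707106781));
  return 0;
}

Only eight fraction bits fit because the operands must stay within a 16x16-bit multiply on the targets this path serves.
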
diff --git a/media/libjpeg/jfdctint.c b/media/libjpeg/jfdctint.c
index 169bb942ce..c95a3a7fb8 100644
--- a/media/libjpeg/jfdctint.c
+++ b/media/libjpeg/jfdctint.c
@@ -1,14 +1,14 @@
/*
* jfdctint.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* forward DCT (Discrete Cosine Transform).
*
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
@@ -93,18 +93,18 @@
*/
#if CONST_BITS == 13
-#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */
-#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */
-#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */
-#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */
-#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
#else
#define FIX_0_298631336 FIX(0.298631336)
#define FIX_0_390180644 FIX(0.390180644)
@@ -129,9 +129,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
@@ -140,7 +140,7 @@
*/
GLOBAL(void)
-jpeg_fdct_islow (DCTELEM *data)
+jpeg_fdct_islow(DCTELEM *data)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
JLONG tmp10, tmp11, tmp12, tmp13;
@@ -154,7 +154,7 @@ jpeg_fdct_islow (DCTELEM *data)
/* furthermore, we scale the results by 2**PASS1_BITS. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -173,14 +173,14 @@ jpeg_fdct_islow (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
- dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+ dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+ dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS-PASS1_BITS);
- dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS-PASS1_BITS);
+ dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS - PASS1_BITS);
+ dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS - PASS1_BITS);
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
* cK represents cos(K*pi/16).
@@ -197,18 +197,18 @@ jpeg_fdct_islow (DCTELEM *data)
tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
- dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
- dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
- dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
- dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+ dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
dataptr += DCTSIZE; /* advance pointer to next row */
}
@@ -219,15 +219,15 @@ jpeg_fdct_islow (DCTELEM *data)
*/
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part per LL&M figure 1 --- note that published figure is faulty;
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
@@ -238,14 +238,16 @@ jpeg_fdct_islow (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
- dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+ dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS);
+ dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE * 2] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 6] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS + PASS1_BITS);
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
* cK represents cos(K*pi/16).
@@ -262,22 +264,22 @@ jpeg_fdct_islow (DCTELEM *data)
tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
- dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
- CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,
+ CONST_BITS + PASS1_BITS);
dataptr++; /* advance pointer to next column */
}
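
The islow FDCT's constant table above uses CONST_BITS == 13, and each JLONG value is FIX(x), i.e. x scaled to 13 fraction bits and rounded. A small sketch reproducing the table entries from the same formula the library's FIX macro uses:

#include <stdio.h>

#define CONST_BITS 13

/* FIX(x) == (JLONG)((x) * (1L << CONST_BITS) + 0.5) in the library. */
static long FIX(double x)
{
  return (long)(x * (1L << CONST_BITS) + 0.5);
}

int main(void)
{
  printf("FIX(0.541196100) = %ld\n", FIX(0.541196100));  /* 4433  */
  printf("FIX(1.847759065) = %ld\n", FIX(1.847759065));  /* 15137 */
  printf("FIX(3.072711026) = %ld\n", FIX(3.072711026));  /* 25172 */
  return 0;
}
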
diff --git a/media/libjpeg/jidctflt.c b/media/libjpeg/jidctflt.c
index 68c521ed7e..5aee74e232 100644
--- a/media/libjpeg/jidctflt.c
+++ b/media/libjpeg/jidctflt.c
@@ -61,7 +61,7 @@
* entry; produce a float result.
*/
-#define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((FAST_FLOAT)(coef)) * (quantval))
/*
@@ -69,9 +69,9 @@
*/
GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -83,12 +83,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPLE *range_limit = cinfo->sample_range_limit;
int ctr;
FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
- #define _0_125 ((FLOAT_MULT_TYPE)0.125)
+#define _0_125 ((FLOAT_MULT_TYPE)0.125)
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;
+ quantptr = (FLOAT_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -100,22 +100,22 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],
- quantptr[DCTSIZE*0] * _0_125);
-
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0] * _0_125);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -125,16 +125,16 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] * _0_125);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] * _0_125);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] * _0_125);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] * _0_125);
tmp10 = tmp0 + tmp2; /* phase 3 */
tmp11 = tmp0 - tmp2;
tmp13 = tmp1 + tmp3; /* phases 5-3 */
- tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
+ tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT)1.414213562) - tmp13; /* 2*c4 */
tmp0 = tmp10 + tmp13; /* phase 2 */
tmp3 = tmp10 - tmp13;
@@ -143,10 +143,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);
- tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);
- tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);
- tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);
+ tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] * _0_125);
+ tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] * _0_125);
+ tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] * _0_125);
+ tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] * _0_125);
z13 = tmp6 + tmp5; /* phase 6 */
z10 = tmp6 - tmp5;
@@ -154,24 +154,24 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z12 = tmp4 - tmp7;
tmp7 = z11 + z13; /* phase 5 */
- tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562); /* 2*c4 */
- z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
- tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
- tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 - tmp5;
- wsptr[DCTSIZE*0] = tmp0 + tmp7;
- wsptr[DCTSIZE*7] = tmp0 - tmp7;
- wsptr[DCTSIZE*1] = tmp1 + tmp6;
- wsptr[DCTSIZE*6] = tmp1 - tmp6;
- wsptr[DCTSIZE*2] = tmp2 + tmp5;
- wsptr[DCTSIZE*5] = tmp2 - tmp5;
- wsptr[DCTSIZE*3] = tmp3 + tmp4;
- wsptr[DCTSIZE*4] = tmp3 - tmp4;
+ wsptr[DCTSIZE * 0] = tmp0 + tmp7;
+ wsptr[DCTSIZE * 7] = tmp0 - tmp7;
+ wsptr[DCTSIZE * 1] = tmp1 + tmp6;
+ wsptr[DCTSIZE * 6] = tmp1 - tmp6;
+ wsptr[DCTSIZE * 2] = tmp2 + tmp5;
+ wsptr[DCTSIZE * 5] = tmp2 - tmp5;
+ wsptr[DCTSIZE * 3] = tmp3 + tmp4;
+ wsptr[DCTSIZE * 4] = tmp3 - tmp4;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -192,12 +192,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Apply signed->unsigned and prepare float->int conversion */
- z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+ z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
tmp10 = z5 + wsptr[4];
tmp11 = z5 - wsptr[4];
tmp13 = wsptr[2] + wsptr[6];
- tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
+ tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT)1.414213562) - tmp13;
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
@@ -212,11 +212,11 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z12 = wsptr[1] - wsptr[7];
tmp7 = z11 + z13;
- tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562);
- z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
- tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
- tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
@@ -224,14 +224,14 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: float->int conversion and range-limit */
- outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
- outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
- outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
- outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
- outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
- outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
- outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
- outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
+ outptr[0] = range_limit[((int)(tmp0 + tmp7)) & RANGE_MASK];
+ outptr[7] = range_limit[((int)(tmp0 - tmp7)) & RANGE_MASK];
+ outptr[1] = range_limit[((int)(tmp1 + tmp6)) & RANGE_MASK];
+ outptr[6] = range_limit[((int)(tmp1 - tmp6)) & RANGE_MASK];
+ outptr[2] = range_limit[((int)(tmp2 + tmp5)) & RANGE_MASK];
+ outptr[5] = range_limit[((int)(tmp2 - tmp5)) & RANGE_MASK];
+ outptr[3] = range_limit[((int)(tmp3 + tmp4)) & RANGE_MASK];
+ outptr[4] = range_limit[((int)(tmp3 - tmp4)) & RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
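
Pass 1 of the float IDCT above opens with a sparsity shortcut: when every AC coefficient in a column is zero, the 1-D IDCT of that column is a constant, so all eight outputs are just the dequantized DC value. A self-contained sketch of that special case (simplified types; the real code interleaves this with the full butterfly):

#include <stdio.h>

#define DCTSIZE 8

static void idct_column_dc_only(const short *inptr, const float *quantptr,
                                float *wsptr)
{
  float dcval = (float)inptr[0] * quantptr[0];  /* dequantized DC term */
  int row;

  for (row = 0; row < DCTSIZE; row++)
    wsptr[DCTSIZE * row] = dcval;               /* constant output column */
}

int main(void)
{
  short coef[DCTSIZE * DCTSIZE]  = { 64 };      /* DC = 64, all AC zero */
  float quant[DCTSIZE * DCTSIZE] = { 0.125f };  /* quant step folded with 1/8 */
  float ws[DCTSIZE * DCTSIZE]    = { 0 };

  idct_column_dc_only(coef, quant, ws);
  printf("%g %g\n", ws[0], ws[DCTSIZE * 7]);    /* prints: 8 8 */
  return 0;
}

Since quantized AC coefficients are usually mostly zero, this test pays for itself on typical images.
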
diff --git a/media/libjpeg/jidctfst.c b/media/libjpeg/jidctfst.c
index 10db739b86..89a20c937b 100644
--- a/media/libjpeg/jidctfst.c
+++ b/media/libjpeg/jidctfst.c
@@ -92,10 +92,10 @@
*/
#if CONST_BITS == 8
-#define FIX_1_082392200 ((JLONG) 277) /* FIX(1.082392200) */
-#define FIX_1_414213562 ((JLONG) 362) /* FIX(1.414213562) */
-#define FIX_1_847759065 ((JLONG) 473) /* FIX(1.847759065) */
-#define FIX_2_613125930 ((JLONG) 669) /* FIX(2.613125930) */
+#define FIX_1_082392200 ((JLONG)277) /* FIX(1.082392200) */
+#define FIX_1_414213562 ((JLONG)362) /* FIX(1.414213562) */
+#define FIX_1_847759065 ((JLONG)473) /* FIX(1.847759065) */
+#define FIX_2_613125930 ((JLONG)669) /* FIX(2.613125930) */
#else
#define FIX_1_082392200 FIX(1.082392200)
#define FIX_1_414213562 FIX(1.414213562)
@@ -111,7 +111,7 @@
#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
-#define DESCALE(x,n) RIGHT_SHIFT(x, n)
+#define DESCALE(x, n) RIGHT_SHIFT(x, n)
#endif
@@ -119,7 +119,7 @@
* descale to yield a DCTELEM result.
*/
-#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
/* Dequantize a coefficient by multiplying it by the multiplier-table
@@ -129,10 +129,10 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((IFAST_MULT_TYPE)(coef)) * (quantval))
#else
-#define DEQUANTIZE(coef,quantval) \
- DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#define DEQUANTIZE(coef, quantval) \
+ DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
#endif
@@ -147,19 +147,19 @@
#else
#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */
#endif
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
#ifdef USE_ACCURATE_ROUNDING
-#define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
#else
-#define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n))
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT(x, n))
#endif
@@ -168,9 +168,9 @@
*/
GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -188,7 +188,7 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+ quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -200,21 +200,21 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -224,10 +224,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = tmp0 + tmp2; /* phase 3 */
tmp11 = tmp0 - tmp2;
@@ -242,10 +242,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
z13 = tmp6 + tmp5; /* phase 6 */
z10 = tmp6 - tmp5;
@@ -257,20 +257,20 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
- tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+ tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 + tmp5;
- wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
- wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
- wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
- wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
- wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
- wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
- wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
- wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+ wsptr[DCTSIZE * 0] = (int)(tmp0 + tmp7);
+ wsptr[DCTSIZE * 7] = (int)(tmp0 - tmp7);
+ wsptr[DCTSIZE * 1] = (int)(tmp1 + tmp6);
+ wsptr[DCTSIZE * 6] = (int)(tmp1 - tmp6);
+ wsptr[DCTSIZE * 2] = (int)(tmp2 + tmp5);
+ wsptr[DCTSIZE * 5] = (int)(tmp2 - tmp5);
+ wsptr[DCTSIZE * 4] = (int)(tmp3 + tmp4);
+ wsptr[DCTSIZE * 3] = (int)(tmp3 - tmp4);
inptr++; /* advance pointers to next column */
quantptr++;
@@ -296,8 +296,8 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval =
+ range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -315,12 +315,12 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
- tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
+ tmp10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]);
+ tmp11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]);
- tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
- tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
- - tmp13;
+ tmp13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]);
+ tmp12 =
+ MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - tmp13;
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
@@ -329,17 +329,17 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
- z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
- z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
- z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
+ z13 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3];
+ z10 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3];
+ z11 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7];
+ z12 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7];
tmp7 = z11 + z13; /* phase 5 */
tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
- tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+ tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
@@ -347,22 +347,22 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: scale down by a factor of 8 and range-limit */
- outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] =
+ range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[7] =
+ range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[1] =
+ range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[6] =
+ range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[2] =
+ range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[5] =
+ range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[4] =
+ range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[3] =
+ range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + 3) & RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
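
The IRIGHT_SHIFT fallback reformatted above synthesizes an arithmetic right shift for compilers whose signed >> is logical: after shifting, it ORs copies of the sign bit back into the vacated high positions. A sketch of the same idea, written with unsigned arithmetic for well-defined behavior and assuming a 32-bit two's-complement DCTELEM:

#include <stdio.h>

typedef int DCTELEM;            /* assume 32 bits, as the #else branch does */
#define DCTELEMBITS 32

static DCTELEM iright_shift(DCTELEM x, int shft)
{
  if (x < 0)
    return (DCTELEM)(((unsigned)x >> shft) |
                     (~(unsigned)0 << (DCTELEMBITS - shft)));
  return x >> shft;
}

int main(void)
{
  printf("%d\n", iright_shift(-256, 3));   /* prints -32: sign preserved */
  return 0;
}

On compilers where signed >> already propagates the sign, the simple one-line IRIGHT_SHIFT definition is used instead.
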
diff --git a/media/libjpeg/jidctint.c b/media/libjpeg/jidctint.c
index 3ac6caf692..bb08748019 100644
--- a/media/libjpeg/jidctint.c
+++ b/media/libjpeg/jidctint.c
@@ -1,15 +1,15 @@
/*
* jidctint.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
* must also perform dequantization of the input coefficients.
*
@@ -115,18 +115,18 @@
*/
#if CONST_BITS == 13
-#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */
-#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */
-#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */
-#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */
-#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
#else
#define FIX_0_298631336 FIX(0.298631336)
#define FIX_0_390180644 FIX(0.390180644)
@@ -151,9 +151,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
@@ -162,7 +162,7 @@
* are 16 bits or less, so either int or short multiply will work.
*/
-#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
/*
@@ -170,9 +170,9 @@
*/
GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3;
JLONG tmp10, tmp11, tmp12, tmp13;
@@ -191,7 +191,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* furthermore, we scale the results by 2**PASS1_BITS. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -203,22 +203,22 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
-
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -229,15 +229,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+ tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
- z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
@@ -251,10 +251,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@@ -266,10 +266,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
@@ -281,14 +281,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
inptr++; /* advance pointers to next column */
quantptr++;
@@ -314,8 +314,8 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -334,15 +334,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[6];
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+ tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
- tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
- tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
+ tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
+ tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
@@ -353,10 +353,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
- tmp0 = (JLONG) wsptr[7];
- tmp1 = (JLONG) wsptr[5];
- tmp2 = (JLONG) wsptr[3];
- tmp3 = (JLONG) wsptr[1];
+ tmp0 = (JLONG)wsptr[7];
+ tmp1 = (JLONG)wsptr[5];
+ tmp2 = (JLONG)wsptr[3];
+ tmp3 = (JLONG)wsptr[1];
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@@ -368,10 +368,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
@@ -383,30 +383,30 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
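
The least obvious step in the even part above is the rotation flagged by "The rotator is sqrt(2)*c(-6)". Writing a = sqrt(2)*cos(2*pi/16) and b = sqrt(2)*cos(6*pi/16), the constants FIX_0_541196100, FIX_1_847759065 and FIX_0_765366865 are b, a+b and a-b, and sharing z1 = (z2+z3)*b yields the pair z2*b - z3*a and z2*a + z3*b with three multiplications instead of four. A floating-point sketch, separate from the diff, that checks the identity numerically:

#include <math.h>
#include <stdio.h>

int main(void)
{
  const double PI = 3.14159265358979323846;
  double a = sqrt(2.0) * cos(2.0 * PI / 16.0);  /* ~1.306562965 */
  double b = sqrt(2.0) * cos(6.0 * PI / 16.0);  /* ~0.541196100 */
  double z2 = 3.0, z3 = 7.0;                    /* arbitrary inputs */
  double z1, tmp2, tmp3;

  z1 = (z2 + z3) * b;        /* the shared product */
  tmp2 = z1 - z3 * (a + b);  /* as in MULTIPLY(z3, -FIX_1_847759065) */
  tmp3 = z1 + z2 * (a - b);  /* as in MULTIPLY(z2, FIX_0_765366865) */

  printf("%.9f == %.9f\n", tmp2, z2 * b - z3 * a);  /* identical */
  printf("%.9f == %.9f\n", tmp3, z2 * a + z3 * b);  /* identical */
  return 0;
}
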
@@ -417,16 +417,16 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/*
* Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
*
* Optimized algorithm with 12 multiplications in the 1-D kernel.
* cK represents sqrt(2) * cos(K*pi/14).
*/
GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
JLONG z1, z2, z3;
@@ -436,25 +436,25 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[7*7]; /* buffers data between passes */
+ int workspace[7 * 7]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
@@ -468,15 +468,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
tmp0 = tmp1 - tmp2;
tmp1 += tmp2;
- tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
tmp1 += tmp2;
z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
tmp0 += z2;
@@ -484,13 +484,13 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 7 rows from work array, store into output array. */
@@ -502,12 +502,12 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
@@ -521,15 +521,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
tmp0 = tmp1 - tmp2;
tmp1 += tmp2;
- tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
tmp1 += tmp2;
z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
tmp0 += z2;
@@ -537,27 +537,27 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 7; /* advance pointer to next row */
}
@@ -573,9 +573,9 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
JLONG z1, z2, z3;
@@ -585,35 +585,35 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[6*6]; /* buffers data between passes */
+ int workspace[6 * 6]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
tmp1 = tmp0 + tmp10;
- tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
- tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
tmp10 = tmp1 + tmp0;
tmp12 = tmp1 - tmp0;
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -621,12 +621,12 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[6*1] = (int) (tmp11 + tmp1);
- wsptr[6*4] = (int) (tmp11 - tmp1);
- wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 1] = (int)(tmp11 + tmp1);
+ wsptr[6 * 4] = (int)(tmp11 - tmp1);
+ wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 6 rows from work array, store into output array. */
@@ -638,22 +638,22 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- tmp2 = (JLONG) wsptr[4];
+ tmp2 = (JLONG)wsptr[4];
tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
tmp1 = tmp0 + tmp10;
tmp11 = tmp0 - tmp10 - tmp10;
- tmp10 = (JLONG) wsptr[2];
+ tmp10 = (JLONG)wsptr[2];
tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
tmp10 = tmp1 + tmp0;
tmp12 = tmp1 - tmp0;
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -661,24 +661,24 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 6; /* advance pointer to next row */
}
@@ -694,9 +694,9 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
JLONG z1, z2, z3;
@@ -706,23 +706,23 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[5*5]; /* buffers data between passes */
+ int workspace[5 * 5]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
z3 = tmp12 + z2;
@@ -732,8 +732,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
@@ -741,11 +741,11 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 5 rows from work array, store into output array. */
@@ -757,10 +757,10 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
- tmp0 = (JLONG) wsptr[2];
- tmp1 = (JLONG) wsptr[4];
+ tmp0 = (JLONG)wsptr[2];
+ tmp1 = (JLONG)wsptr[4];
z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
z3 = tmp12 + z2;
@@ -770,8 +770,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z2 = (JLONG) wsptr[1];
- z3 = (JLONG) wsptr[3];
+ z2 = (JLONG)wsptr[1];
+ z3 = (JLONG)wsptr[3];
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
@@ -779,21 +779,21 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 5; /* advance pointer to next row */
}
@@ -809,9 +809,9 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp2, tmp10, tmp12;
JCOEFPTR inptr;
@@ -820,36 +820,36 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[3*3]; /* buffers data between passes */
+ int workspace[3 * 3]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
tmp10 = tmp0 + tmp12;
tmp2 = tmp0 - tmp12 - tmp12;
/* Odd part */
- tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
/* Final output stage */
- wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 3 rows from work array, store into output array. */
@@ -861,29 +861,29 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- tmp2 = (JLONG) wsptr[2];
+ tmp2 = (JLONG)wsptr[2];
tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
tmp10 = tmp0 + tmp12;
tmp2 = tmp0 - tmp12 - tmp12;
/* Odd part */
- tmp12 = (JLONG) wsptr[1];
+ tmp12 = (JLONG)wsptr[1];
tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 3; /* advance pointer to next row */
}
@@ -899,9 +899,9 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG z1, z2, z3, z4;
@@ -911,25 +911,25 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*9]; /* buffers data between passes */
+ int workspace[8 * 9]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
tmp1 = tmp0 + tmp3;
@@ -949,12 +949,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
- z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3 */
tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
@@ -966,15 +966,15 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 9 rows from work array, store into output array. */
@@ -986,12 +986,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
tmp1 = tmp0 + tmp3;
@@ -1011,12 +1011,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
- z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3 */
tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
@@ -1028,33 +1028,33 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -1070,9 +1070,9 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
@@ -1083,32 +1083,32 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*10]; /* buffers data between passes */
+ int workspace[8 * 10]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z3 = LEFT_SHIFT(z3, CONST_BITS);
/* Add fudge factor here for final descale. */
- z3 += ONE << (CONST_BITS-PASS1_BITS-1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
tmp10 = z3 + z1;
tmp11 = z3 - z2;
tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
- CONST_BITS-PASS1_BITS); /* c0 = (c4-c8)*2 */
+ CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1121,10 +1121,10 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z2 + z4;
tmp13 = z2 - z4;
@@ -1148,16 +1148,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) (tmp22 + tmp12);
- wsptr[8*7] = (int) (tmp22 - tmp12);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)(tmp22 + tmp12);
+ wsptr[8 * 7] = (int)(tmp22 - tmp12);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 10 rows from work array, store into output array. */
@@ -1169,9 +1169,9 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
tmp10 = z3 + z1;
@@ -1179,8 +1179,8 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1); /* c0 = (c4-c8)*2 */
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[6];
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1193,11 +1193,11 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[7];
+ z4 = (JLONG)wsptr[7];
tmp11 = z2 + z4;
tmp13 = z2 - z4;
@@ -1220,36 +1220,36 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
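
Every kernel in the file finishes the same way as the 10x10 rows above: the second pass descales by CONST_BITS + PASS1_BITS + 3, where the extra 3 bits remove the factor of 8 the two 1-D passes accumulate, and the result is pushed through range_limit[], whose & RANGE_MASK indexing keeps even corrupt coefficients inside the table. Functionally the lookup is a level shift plus saturation; a sketch of the equivalent arithmetic, assuming 8-bit samples (MAXJSAMPLE == 255, CENTERJSAMPLE == 128) and with clamp_sample() as a hypothetical stand-in for the table:

#define MAXJSAMPLE 255     /* assumption: 8-bit sample build */
#define CENTERJSAMPLE 128

/* What range_limit[v & RANGE_MASK] computes for in-range v: undo the
 * JPEG level shift, then saturate to a legal sample value.  The real
 * table form additionally tolerates wildly out-of-range inputs. */
static int clamp_sample(long v)
{
  v += CENTERJSAMPLE;
  if (v < 0) return 0;
  if (v > MAXJSAMPLE) return MAXJSAMPLE;
  return (int)v;
}
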
@@ -1258,16 +1258,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/*
* Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
*
* Optimized algorithm with 24 multiplications in the 1-D kernel.
* cK represents sqrt(2) * cos(K*pi/22).
*/
GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1278,30 +1278,30 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*11]; /* buffers data between passes */
+ int workspace[8 * 11]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
z4 = z1 + z3;
- tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
z4 -= z2;
tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1316,10 +1316,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z1 + z2;
tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1331,26 +1331,26 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
- z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
tmp11 += z1;
tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
- tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 11 rows from work array, store into output array. */
@@ -1362,17 +1362,17 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
z4 = z1 + z3;
- tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
z4 -= z2;
tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1387,10 +1387,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = z1 + z2;
tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1402,48 +1402,48 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
- z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
tmp11 += z1;
tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
- tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -1459,9 +1459,9 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1472,32 +1472,32 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*12]; /* buffers data between passes */
+ int workspace[8 * 12]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z3 = LEFT_SHIFT(z3, CONST_BITS);
/* Add fudge factor here for final descale. */
- z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
tmp10 = z3 + z4;
tmp11 = z3 - z4;
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z2 = LEFT_SHIFT(z2, CONST_BITS);
tmp12 = z1 - z2;
@@ -1517,19 +1517,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
- tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
tmp10 = z1 + z3;
tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
- tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
@@ -1543,18 +1543,18 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 12 rows from work array, store into output array. */
@@ -1566,19 +1566,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
tmp10 = z3 + z4;
tmp11 = z3 - z4;
- z1 = (JLONG) wsptr[2];
+ z1 = (JLONG)wsptr[2];
z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[6];
z2 = LEFT_SHIFT(z2, CONST_BITS);
tmp12 = z1 - z2;
@@ -1598,19 +1598,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
- tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
tmp10 = z1 + z3;
tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
- tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
@@ -1624,42 +1624,42 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
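
The 12x12 routine above, like every scaled IDCT in this file, runs in two separable passes over an on-stack workspace: pass 1 transforms the 8 input columns and stores 12 results per column at a stride of 8 (the wsptr[8 * row] stores), and pass 2 then reads the workspace back one 8-entry row at a time (wsptr += 8). A minimal standalone sketch of that layout, for orientation only; it is not part of the patch and the names are hypothetical:

    /* Column-major stores in pass 1, row reads in pass 2. */
    #define WS_COLS  8
    #define WS_ROWS  12

    static int ws_sketch[WS_COLS * WS_ROWS];

    static void store_column(int col, const int *vals)   /* pass 1 */
    {
      int row;
      for (row = 0; row < WS_ROWS; row++)
        ws_sketch[WS_COLS * row + col] = vals[row];      /* wsptr[8 * row] */
    }

    static const int *load_row(int row)                  /* pass 2 */
    {
      return &ws_sketch[WS_COLS * row];                  /* wsptr += 8 */
    }
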
@@ -1675,9 +1675,9 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1688,25 +1688,25 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*13]; /* buffers data between passes */
+ int workspace[8 * 13]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = z3 + z4;
tmp11 = z3 - z4;
@@ -1721,22 +1721,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
- tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
- tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
- tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
@@ -1744,13 +1744,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
tmp10 = tmp11 + tmp12 + tmp13 -
MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
- tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
- tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
tmp11 += tmp14;
tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
- tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
tmp12 += tmp14;
tmp13 += tmp14;
tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
@@ -1763,19 +1763,19 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 13 rows from work array, store into output array. */
@@ -1787,12 +1787,12 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[4];
- z4 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
tmp10 = z3 + z4;
tmp11 = z3 - z4;
@@ -1807,22 +1807,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
- tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
- tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
- tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
@@ -1830,13 +1830,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
tmp10 = tmp11 + tmp12 + tmp13 -
MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
- tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
- tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
tmp11 += tmp14;
tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
- tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
tmp12 += tmp14;
tmp13 += tmp14;
tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
@@ -1849,45 +1849,45 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
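
Every final output stage in these hunks funnels the descaled result through range_limit[... & RANGE_MASK]. Assuming the usual 8-bit-sample build, RANGE_MASK is MAXJSAMPLE * 4 + 3 = 1023 and range_limit points into a clamp table the decoder prepares elsewhere, so even a wildly out-of-range IDCT result indexes safely and clamps to 0..255. A hedged sketch of the idiom; the table contents and names are assumptions, not from this patch:

    #define SK_MAXJSAMPLE  255
    #define SK_RANGE_MASK  (SK_MAXJSAMPLE * 4 + 3)   /* 1023 */

    /* range_limit is assumed to map any masked index back into 0..255 */
    static unsigned char clamp_sample(long descaled,
                                      const unsigned char *range_limit)
    {
      return range_limit[(int)descaled & SK_RANGE_MASK];
    }
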
@@ -1903,9 +1903,9 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1916,22 +1916,22 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*14]; /* buffers data between passes */
+ int workspace[8 * 14]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
@@ -1941,10 +1941,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = z1 - z4;
tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
- CONST_BITS-PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
+ CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
@@ -1962,10 +1962,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp13 = LEFT_SHIFT(z4, CONST_BITS);
tmp14 = z1 + z3;
@@ -1978,7 +1978,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
tmp16 += tmp15;
z1 += z4;
- z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+ z4 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13; /* -c13 */
tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
@@ -1989,20 +1989,20 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) (tmp23 + tmp13);
- wsptr[8*10] = (int) (tmp23 - tmp13);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)(tmp23 + tmp13);
+ wsptr[8 * 10] = (int)(tmp23 - tmp13);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 14 rows from work array, store into output array. */
@@ -2014,9 +2014,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
@@ -2027,8 +2027,8 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1); /* c0 = (c4+c12-c8)*2 */
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
@@ -2046,10 +2046,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
z4 = LEFT_SHIFT(z4, CONST_BITS);
tmp14 = z1 + z3;
@@ -2061,7 +2061,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 -= z2;
tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
tmp16 += tmp15;
- tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */
+ tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4; /* -c13 */
tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
@@ -2072,48 +2072,48 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -2129,9 +2129,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2142,25 +2142,25 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*15]; /* buffers data between passes */
+ int workspace[8 * 15]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2195,19 +2195,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp13 = z2 - z4;
tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
- tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
- tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
z2 = z1 - z4;
tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
@@ -2220,21 +2220,21 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 15 rows from work array, store into output array. */
@@ -2246,12 +2246,12 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[4];
- z4 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2286,19 +2286,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z4 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[5];
z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
- z4 = (JLONG) wsptr[7];
+ z4 = (JLONG)wsptr[7];
tmp13 = z2 - z4;
tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
- tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
- tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
z2 = z1 - z4;
tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
@@ -2311,51 +2311,51 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
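
A pattern worth noting across these routines: outputs are produced as sum/difference pairs (k, N-1-k) of an even-part and an odd-part accumulator, and when N is odd, as in the 13x13 and 15x15 cases above, the center sample has no partner. That is why each of those output stages ends with a single unpaired store (tmp26 and tmp27 respectively). A standalone sketch of the pairing, with illustrative names not taken from the patch:

    static void butterfly_outputs_sketch(const long *even, const long *odd,
                                         int *out, int n)
    {
      int k;
      for (k = 0; k < n / 2; k++) {
        out[k]         = (int)(even[k] + odd[k]);
        out[n - 1 - k] = (int)(even[k] - odd[k]);
      }
      if (n & 1)                        /* odd N: lone center output */
        out[n / 2] = (int)even[n / 2];
    }
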
@@ -2371,9 +2371,9 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2384,23 +2384,23 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*16]; /* buffers data between passes */
+ int workspace[8 * 16]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@@ -2409,8 +2409,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -2431,10 +2431,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z1 + z3;
@@ -2455,13 +2455,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
z2 += z4;
- z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
tmp1 += z1;
tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
- z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
tmp12 += z2;
- z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
tmp2 += z2;
tmp3 += z2;
z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
@@ -2470,22 +2470,22 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 16 rows from work array, store into output array. */
@@ -2497,10 +2497,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- z1 = (JLONG) wsptr[4];
+ z1 = (JLONG)wsptr[4];
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@@ -2509,8 +2509,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -2531,10 +2531,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = z1 + z3;
@@ -2555,13 +2555,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
z2 += z4;
- z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
tmp1 += z1;
tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
- z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
tmp12 += z2;
- z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
tmp2 += z2;
tmp3 += z2;
z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
@@ -2570,54 +2570,54 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
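
Apart from whitespace and line wrapping, the only token-level change visible in the jidctint.c hunks above is in jpeg_idct_16x16, where the fudge-factor shift swaps the plain literal 1 for the JLONG-typed ONE macro. The fudge factor itself is the standard round-to-nearest trick for fixed-point descaling: add half of the final divisor before shifting down. A self-contained sketch, assuming the Q13 scale (CONST_BITS == 13) used throughout this file:

    #include <stdio.h>

    #define SK_ONE         1L
    #define SK_CONST_BITS  13
    #define SK_DESCALE(x, n)  (((x) + (SK_ONE << ((n) - 1))) >> (n))

    int main(void)
    {
      /* 1.5 in Q13 is 12288; the rounding descale yields 2, not 1 */
      long q13 = (long)(1.5 * (SK_ONE << SK_CONST_BITS));
      printf("%ld\n", SK_DESCALE(q13, SK_CONST_BITS));
      return 0;
    }
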
diff --git a/media/libjpeg/jidctred.c b/media/libjpeg/jidctred.c
index 7a81803b8d..1dd65a94d9 100644
--- a/media/libjpeg/jidctred.c
+++ b/media/libjpeg/jidctred.c
@@ -1,7 +1,7 @@
/*
* jidctred.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2015, D. R. Commander.
@@ -58,20 +58,20 @@
*/
#if CONST_BITS == 13
-#define FIX_0_211164243 ((JLONG) 1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 ((JLONG) 4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 ((JLONG) 4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 ((JLONG) 5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 ((JLONG) 6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 ((JLONG) 8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 ((JLONG) 10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 ((JLONG) 11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 ((JLONG) 17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 ((JLONG) 29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 ((JLONG)1730) /* FIX(0.211164243) */
+#define FIX_0_509795579 ((JLONG)4176) /* FIX(0.509795579) */
+#define FIX_0_601344887 ((JLONG)4926) /* FIX(0.601344887) */
+#define FIX_0_720959822 ((JLONG)5906) /* FIX(0.720959822) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_850430095 ((JLONG)6967) /* FIX(0.850430095) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_061594337 ((JLONG)8697) /* FIX(1.061594337) */
+#define FIX_1_272758580 ((JLONG)10426) /* FIX(1.272758580) */
+#define FIX_1_451774981 ((JLONG)11893) /* FIX(1.451774981) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_2_172734803 ((JLONG)17799) /* FIX(2.172734803) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_624509785 ((JLONG)29692) /* FIX(3.624509785) */
#else
#define FIX_0_211164243 FIX(0.211164243)
#define FIX_0_509795579 FIX(0.509795579)
@@ -98,9 +98,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
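
The reformatted constants above are precomputed FIX(x) values for the CONST_BITS == 13 build, i.e. each x scaled by 2^13 = 8192 and rounded to the nearest integer. A quick standalone check of that reading, under the assumption that it matches the FIX() definition in jdct.h; this is illustration, not part of the patch:

    #include <assert.h>

    #define Q13(x)  ((long)((x) * 8192.0 + 0.5))

    int main(void)
    {
      assert(Q13(0.211164243) ==  1730);
      assert(Q13(1.847759065) == 15137);
      assert(Q13(3.624509785) == 29692);
      return 0;
    }
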
@@ -109,7 +109,7 @@
* are 16 bits or less, so either int or short multiply will work.
*/
-#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
/*
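
DEQUANTIZE, reformatted just above, undoes the encoder's quantization before the IDCT runs: each stored coefficient is multiplied by its per-frequency quantization-table entry, and the surrounding comment notes both operands fit in 16 bits, so a plain multiply suffices. A minimal sketch with SK_MULT_TYPE standing in for ISLOW_MULT_TYPE; the names are hypothetical:

    typedef short SK_MULT_TYPE;    /* stands in for ISLOW_MULT_TYPE */

    static long dequantize_sketch(short coef, SK_MULT_TYPE quantval)
    {
      /* 16-bit x 16-bit product fits comfortably in a long */
      return (long)(((SK_MULT_TYPE)coef) * quantval);
    }
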
@@ -118,9 +118,9 @@
*/
GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp2, tmp10, tmp12;
JLONG z1, z2, z3, z4;
@@ -130,69 +130,73 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[DCTSIZE*4]; /* buffers data between passes */
+ int workspace[DCTSIZE * 4]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
/* Don't bother to process column 4, because second pass won't use it */
- if (ctr == DCTSIZE-4)
+ if (ctr == DCTSIZE - 4)
continue;
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
- inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 5] == 0 &&
+ inptr[DCTSIZE * 6] == 0 && inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero; we need not examine term 4 for 4x4 output */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
continue;
}
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS + 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
- tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
+ tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, -FIX_0_765366865);
tmp10 = tmp0 + tmp2;
tmp12 = tmp0 - tmp2;
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
- tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
- + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
- + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
- + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
- tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
- + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
- + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
- + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
/* Final output stage */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 3] =
+ (int)DESCALE(tmp10 - tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp12 + tmp0, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 2] =
+ (int)DESCALE(tmp12 - tmp0, CONST_BITS - PASS1_BITS + 1);
}
/* Pass 2: process 4 rows from work array, store into output array. */
@@ -206,8 +210,8 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -221,45 +225,45 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
+ tmp0 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 1);
- tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
+ tmp2 = MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) +
+ MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865);
tmp10 = tmp0 + tmp2;
tmp12 = tmp0 - tmp2;
/* Odd part */
- z1 = (JLONG) wsptr[7];
- z2 = (JLONG) wsptr[5];
- z3 = (JLONG) wsptr[3];
- z4 = (JLONG) wsptr[1];
+ z1 = (JLONG)wsptr[7];
+ z2 = (JLONG)wsptr[5];
+ z3 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[1];
- tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
- + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
- + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
- + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
- tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
- + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
- + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
- + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
/* Final output stage */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)DESCALE(tmp10 - tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp12 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)DESCALE(tmp12 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
@@ -272,9 +276,9 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp10, z1;
JCOEFPTR inptr;
@@ -283,50 +287,52 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[DCTSIZE*2]; /* buffers data between passes */
+ int workspace[DCTSIZE * 2]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
/* Don't bother to process columns 2,4,6 */
- if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
+ if (ctr == DCTSIZE - 2 || ctr == DCTSIZE - 4 || ctr == DCTSIZE - 6)
continue;
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 3] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
continue;
}
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp10 = LEFT_SHIFT(z1, CONST_BITS+2);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp10 = LEFT_SHIFT(z1, CONST_BITS + 2);
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp0 = MULTIPLY(z1, -FIX_0_720959822); /* sqrt(2) * ( c7-c5+c3-c1) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp0 += MULTIPLY(z1, -FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
/* Final output stage */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp0, CONST_BITS - PASS1_BITS + 2);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp10 - tmp0, CONST_BITS - PASS1_BITS + 2);
}
/* Pass 2: process 2 rows from work array, store into output array. */
@@ -339,8 +345,8 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
#ifndef NO_ZERO_ROW_TEST
if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -352,23 +358,23 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
+ tmp10 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 2);
/* Odd part */
- tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
- + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
- + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
- + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+ tmp0 = MULTIPLY((JLONG)wsptr[7], -FIX_0_720959822) + /* sqrt(2) * ( c7-c5+c3-c1) */
+ MULTIPLY((JLONG)wsptr[5], FIX_0_850430095) + /* sqrt(2) * (-c1+c3+c5+c7) */
+ MULTIPLY((JLONG)wsptr[3], -FIX_1_272758580) + /* sqrt(2) * (-c1+c3-c5-c7) */
+ MULTIPLY((JLONG)wsptr[1], FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
/* Final output stage */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3+2)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3+2)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
@@ -381,9 +387,9 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
int dcval;
ISLOW_MULT_TYPE *quantptr;
@@ -393,9 +399,9 @@ jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* We hardly need an inverse DCT routine for this: just take the
* average pixel value, which is one-eighth of the DC coefficient.
*/
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
- dcval = (int) DESCALE((JLONG) dcval, 3);
+ dcval = (int)DESCALE((JLONG)dcval, 3);
output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
}
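
Note: the 1x1 case above needs no real transform, because the DC coefficient of an 8x8 DCT is eight times the mean sample value, so the output pixel is just the dequantized DC descaled by 3 bits. A minimal standalone sketch of that arithmetic, with DEQUANTIZE/DESCALE written out to mirror the jdct.h macros (the sample inputs are illustrative):

  /* Illustrative sketch of jpeg_idct_1x1()'s arithmetic. */
  #include <stdio.h>

  #define DEQUANTIZE(coef, quantval)  ((long)(coef) * (quantval))
  /* Rounding right shift, same idea as libjpeg's DESCALE. */
  #define DESCALE(x, n)  (((x) + (1L << ((n) - 1))) >> (n))

  int main(void)
  {
    long dc_coef = 92, quantval = 16;           /* made-up inputs */
    long dcval = DEQUANTIZE(dc_coef, quantval);
    /* One-eighth of the DC coefficient, rounded: the average pixel. */
    printf("%ld\n", DESCALE(dcval, 3));         /* (92*16 + 4) >> 3 = 184 */
    return 0;
  }
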
diff --git a/media/libjpeg/jinclude.h b/media/libjpeg/jinclude.h
index d461a1aa16..120614b25c 100644
--- a/media/libjpeg/jinclude.h
+++ b/media/libjpeg/jinclude.h
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1994, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,68 +17,117 @@
* JPEG library. Most applications need only include jpeglib.h.
*/
+#ifndef __JINCLUDE_H__
+#define __JINCLUDE_H__
/* Include auto-config file to find out which system include files we need. */
#include "jconfig.h" /* auto configuration options */
+#include "jconfigint.h"
#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */
/*
- * We need the NULL macro and size_t typedef.
- * On an ANSI-conforming system it is sufficient to include <stddef.h>.
- * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
- * pull in <sys/types.h> as well.
* Note that the core JPEG library does not require <stdio.h>;
* only the default error handler and data source/destination modules do.
* But we must pull it in because of the references to FILE in jpeglib.h.
* You can remove those references if you want to compile without <stdio.h>.
*/
-#ifdef HAVE_STDDEF_H
#include <stddef.h>
-#endif
-
-#ifdef HAVE_STDLIB_H
#include <stdlib.h>
-#endif
-
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
#include <stdio.h>
+#include <string.h>
/*
- * We need memory copying and zeroing functions, plus strncpy().
- * ANSI and System V implementations declare these in <string.h>.
- * BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
- * Some systems may declare memset and memcpy in <memory.h>.
- *
- * NOTE: we assume the size parameters to these functions are of type size_t.
- * Change the casts in these macros if not!
+ * These macros/inline functions facilitate using Microsoft's "safe string"
+ * functions with Visual Studio builds without the need to scatter #ifdefs
+ * throughout the code base.
*/
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+#ifndef NO_GETENV
-#else /* not BSD, assume ANSI/SysV string lib */
+#ifdef _MSC_VER
-#include <string.h>
-#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+ size_t required_size;
-#endif
+ return (int)getenv_s(&required_size, buffer, buffer_size, name);
+}
-/*
- * The modules that use fread() and fwrite() always invoke them through
- * these macros. On some systems you may need to twiddle the argument casts.
- * CAUTION: argument order is different from underlying functions!
+#else /* _MSC_VER */
+
+#include <errno.h>
+
+/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
+ * but other than parameter validation, it has no advantages over getenv().
+ */
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+ char *env;
+
+ if (!buffer) {
+ if (buffer_size == 0)
+ return 0;
+ else
+ return (errno = EINVAL);
+ }
+ if (buffer_size == 0)
+ return (errno = EINVAL);
+ if (!name) {
+ *buffer = 0;
+ return 0;
+ }
+
+ env = getenv(name);
+  if (!env) {
+ *buffer = 0;
+ return 0;
+ }
+
+ if (strlen(env) + 1 > buffer_size) {
+ *buffer = 0;
+ return ERANGE;
+ }
+
+ strncpy(buffer, env, buffer_size);
+
+ return 0;
+}
+
+#endif /* _MSC_VER */
+
+#endif /* NO_GETENV */
+
+
+#ifndef NO_PUTENV
+
+#ifdef _WIN32
+
+#define PUTENV_S(name, value) _putenv_s(name, value)
+
+#else
+
+/* This provides a similar interface to the Microsoft _putenv_s() function, but
+ * other than parameter validation, it has no advantages over setenv().
*/
-#define JFREAD(file,buf,sizeofbuf) \
- ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
-#define JFWRITE(file,buf,sizeofbuf) \
- ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+static INLINE int PUTENV_S(const char *name, const char *value)
+{
+ if (!name || !value)
+ return (errno = EINVAL);
+
+ setenv(name, value, 1);
+
+ return errno;
+}
+
+#endif /* _WIN32 */
+
+#endif /* NO_PUTENV */
+
+
+#endif /* __JINCLUDE_H__ */
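
A short usage sketch for the GETENV_S wrapper introduced above (the buffer size and variable name are illustrative; jmemmgr.c below consumes it the same way):

  /* Illustrative: portably read an environment variable via GETENV_S.
   * Returns 0 on success (including "not set"); EINVAL or ERANGE
   * signals bad arguments or an undersized buffer. */
  char value[30] = { 0 };

  if (!GETENV_S(value, sizeof(value), "JPEGMEM") && value[0] != '\0') {
    /* value now holds a NUL-terminated copy of the variable's contents */
  }
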
diff --git a/media/libjpeg/jmemmgr.c b/media/libjpeg/jmemmgr.c
index 2a8d8401f4..8f5a4ab1c7 100644
--- a/media/libjpeg/jmemmgr.c
+++ b/media/libjpeg/jmemmgr.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -32,18 +32,14 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
+#if !defined(_MSC_VER) || _MSC_VER > 1600
#include <stdint.h>
-#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */
-
-#ifndef NO_GETENV
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
-extern char *getenv (const char *name);
-#endif
#endif
+#include <limits.h>
LOCAL(size_t)
-round_up_pow2 (size_t a, size_t b)
+round_up_pow2(size_t a, size_t b)
/* a rounded up to the next multiple of b, i.e. ceil(a/b)*b */
/* Assumes a >= 0, b > 0, and b is a power of 2 */
{
@@ -87,7 +83,9 @@ round_up_pow2 (size_t a, size_t b)
#ifndef WITH_SIMD
#define ALIGN_SIZE sizeof(double)
#else
-#define ALIGN_SIZE 16 /* Most SIMD implementations require this */
+#define ALIGN_SIZE 32 /* Most of the SIMD instructions we support require
+ 16-byte (128-bit) alignment, but AVX2 requires
+ 32-byte alignment. */
#endif
#endif
@@ -102,7 +100,7 @@ round_up_pow2 (size_t a, size_t b)
typedef struct small_pool_struct *small_pool_ptr;
typedef struct small_pool_struct {
- small_pool_ptr next; /* next in list of pools */
+ small_pool_ptr next; /* next in list of pools */
size_t bytes_used; /* how many bytes already used within pool */
size_t bytes_left; /* bytes still available in this pool */
} small_pool_hdr;
@@ -110,7 +108,7 @@ typedef struct small_pool_struct {
typedef struct large_pool_struct *large_pool_ptr;
typedef struct large_pool_struct {
- large_pool_ptr next; /* next in list of pools */
+ large_pool_ptr next; /* next in list of pools */
size_t bytes_used; /* how many bytes already used within pool */
size_t bytes_left; /* bytes still available in this pool */
} large_pool_hdr;
@@ -189,9 +187,9 @@ struct jvirt_barray_control {
#ifdef MEM_STATS /* optional extra stuff for statistics */
LOCAL(void)
-print_mem_stats (j_common_ptr cinfo, int pool_id)
+print_mem_stats(j_common_ptr cinfo, int pool_id)
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr shdr_ptr;
large_pool_ptr lhdr_ptr;
@@ -204,15 +202,13 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
lhdr_ptr = lhdr_ptr->next) {
- fprintf(stderr, " Large chunk used %ld\n",
- (long) lhdr_ptr->bytes_used);
+ fprintf(stderr, " Large chunk used %ld\n", (long)lhdr_ptr->bytes_used);
}
for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
shdr_ptr = shdr_ptr->next) {
fprintf(stderr, " Small chunk used %ld free %ld\n",
- (long) shdr_ptr->bytes_used,
- (long) shdr_ptr->bytes_left);
+ (long)shdr_ptr->bytes_used, (long)shdr_ptr->bytes_left);
}
}
@@ -220,7 +216,7 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
LOCAL(void)
-out_of_memory (j_common_ptr cinfo, int which)
+out_of_memory(j_common_ptr cinfo, int which)
/* Report an out-of-memory error and stop execution */
/* If we compiled MEM_STATS support, report alloc requests before dying */
{
@@ -248,26 +244,24 @@ out_of_memory (j_common_ptr cinfo, int which)
* adjustment.
*/
-static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
-{
- 1600, /* first PERMANENT pool */
- 16000 /* first IMAGE pool */
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] = {
+ 1600, /* first PERMANENT pool */
+ 16000 /* first IMAGE pool */
};
-static const size_t extra_pool_slop[JPOOL_NUMPOOLS] =
-{
- 0, /* additional PERMANENT pools */
- 5000 /* additional IMAGE pools */
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = {
+ 0, /* additional PERMANENT pools */
+ 5000 /* additional IMAGE pools */
};
#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */
METHODDEF(void *)
-alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
/* Allocate a "small" object */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr hdr_ptr, prev_hdr_ptr;
char *data_ptr;
size_t min_request, slop;
@@ -311,11 +305,11 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
else
slop = extra_pool_slop[pool_id];
/* Don't ask for more than MAX_ALLOC_CHUNK */
- if (slop > (size_t) (MAX_ALLOC_CHUNK-min_request))
- slop = (size_t) (MAX_ALLOC_CHUNK-min_request);
+ if (slop > (size_t)(MAX_ALLOC_CHUNK - min_request))
+ slop = (size_t)(MAX_ALLOC_CHUNK - min_request);
/* Try to get space, if fail reduce slop and try again */
for (;;) {
- hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop);
+ hdr_ptr = (small_pool_ptr)jpeg_get_small(cinfo, min_request + slop);
if (hdr_ptr != NULL)
break;
slop /= 2;
@@ -334,7 +328,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
}
/* OK, allocate the object from the current pool */
- data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+ data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
@@ -342,7 +336,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
hdr_ptr->bytes_used += sizeofobject;
hdr_ptr->bytes_left -= sizeofobject;
- return (void *) data_ptr;
+ return (void *)data_ptr;
}
@@ -360,10 +354,10 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
*/
METHODDEF(void *)
-alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_large(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
/* Allocate a "large" object */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
large_pool_ptr hdr_ptr;
char *data_ptr;
@@ -388,9 +382,9 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
- hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
- sizeof(large_pool_hdr) +
- ALIGN_SIZE - 1);
+ hdr_ptr = (large_pool_ptr)jpeg_get_large(cinfo, sizeofobject +
+ sizeof(large_pool_hdr) +
+ ALIGN_SIZE - 1);
if (hdr_ptr == NULL)
out_of_memory(cinfo, 4); /* jpeg_get_large failed */
mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
@@ -405,12 +399,12 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
hdr_ptr->bytes_left = 0;
mem->large_list[pool_id] = hdr_ptr;
- data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+ data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
- return (void *) data_ptr;
+ return (void *)data_ptr;
}
@@ -431,11 +425,11 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
*/
METHODDEF(JSAMPARRAY)
-alloc_sarray (j_common_ptr cinfo, int pool_id,
- JDIMENSION samplesperrow, JDIMENSION numrows)
+alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+ JDIMENSION numrows)
/* Allocate a 2-D sample array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
JSAMPARRAY result;
JSAMPROW workspace;
JDIMENSION rowsperchunk, currow, i;
@@ -454,27 +448,27 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
sizeof(JSAMPLE));
/* Calculate max # of rows allowed in one allocation chunk */
- ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
- ((long) samplesperrow * sizeof(JSAMPLE));
+ ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+ ((long)samplesperrow * sizeof(JSAMPLE));
if (ltemp <= 0)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
- if (ltemp < (long) numrows)
- rowsperchunk = (JDIMENSION) ltemp;
+ if (ltemp < (long)numrows)
+ rowsperchunk = (JDIMENSION)ltemp;
else
rowsperchunk = numrows;
mem->last_rowsperchunk = rowsperchunk;
/* Get space for row pointers (small object) */
- result = (JSAMPARRAY) alloc_small(cinfo, pool_id,
- (size_t) (numrows * sizeof(JSAMPROW)));
+ result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+ (size_t)(numrows * sizeof(JSAMPROW)));
/* Get the rows themselves (large objects) */
currow = 0;
while (currow < numrows) {
rowsperchunk = MIN(rowsperchunk, numrows - currow);
- workspace = (JSAMPROW) alloc_large(cinfo, pool_id,
- (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
- * sizeof(JSAMPLE)));
+ workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
+ (size_t)((size_t)rowsperchunk * (size_t)samplesperrow *
+ sizeof(JSAMPLE)));
for (i = rowsperchunk; i > 0; i--) {
result[currow++] = workspace;
workspace += samplesperrow;
@@ -491,11 +485,11 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
*/
METHODDEF(JBLOCKARRAY)
-alloc_barray (j_common_ptr cinfo, int pool_id,
- JDIMENSION blocksperrow, JDIMENSION numrows)
+alloc_barray(j_common_ptr cinfo, int pool_id, JDIMENSION blocksperrow,
+ JDIMENSION numrows)
/* Allocate a 2-D coefficient-block array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
JBLOCKARRAY result;
JBLOCKROW workspace;
JDIMENSION rowsperchunk, currow, i;
@@ -506,27 +500,27 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
out_of_memory(cinfo, 6); /* safety check */
/* Calculate max # of rows allowed in one allocation chunk */
- ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
- ((long) blocksperrow * sizeof(JBLOCK));
+ ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+ ((long)blocksperrow * sizeof(JBLOCK));
if (ltemp <= 0)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
- if (ltemp < (long) numrows)
- rowsperchunk = (JDIMENSION) ltemp;
+ if (ltemp < (long)numrows)
+ rowsperchunk = (JDIMENSION)ltemp;
else
rowsperchunk = numrows;
mem->last_rowsperchunk = rowsperchunk;
/* Get space for row pointers (small object) */
- result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
- (size_t) (numrows * sizeof(JBLOCKROW)));
+ result = (JBLOCKARRAY)alloc_small(cinfo, pool_id,
+ (size_t)(numrows * sizeof(JBLOCKROW)));
/* Get the rows themselves (large objects) */
currow = 0;
while (currow < numrows) {
rowsperchunk = MIN(rowsperchunk, numrows - currow);
- workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
- (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
- * sizeof(JBLOCK)));
+ workspace = (JBLOCKROW)alloc_large(cinfo, pool_id,
+ (size_t)((size_t)rowsperchunk * (size_t)blocksperrow *
+ sizeof(JBLOCK)));
for (i = rowsperchunk; i > 0; i--) {
result[currow++] = workspace;
workspace += blocksperrow;
@@ -575,12 +569,12 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
METHODDEF(jvirt_sarray_ptr)
-request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
- JDIMENSION samplesperrow, JDIMENSION numrows,
- JDIMENSION maxaccess)
+request_virt_sarray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+ JDIMENSION samplesperrow, JDIMENSION numrows,
+ JDIMENSION maxaccess)
/* Request a virtual 2-D sample array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
jvirt_sarray_ptr result;
/* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -588,8 +582,8 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
/* get control block */
- result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id,
- sizeof(struct jvirt_sarray_control));
+ result = (jvirt_sarray_ptr)alloc_small(cinfo, pool_id,
+ sizeof(struct jvirt_sarray_control));
result->mem_buffer = NULL; /* marks array not yet realized */
result->rows_in_array = numrows;
@@ -605,12 +599,12 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
METHODDEF(jvirt_barray_ptr)
-request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
- JDIMENSION blocksperrow, JDIMENSION numrows,
- JDIMENSION maxaccess)
+request_virt_barray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+ JDIMENSION blocksperrow, JDIMENSION numrows,
+ JDIMENSION maxaccess)
/* Request a virtual 2-D coefficient-block array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
jvirt_barray_ptr result;
/* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -618,8 +612,8 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
/* get control block */
- result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id,
- sizeof(struct jvirt_barray_control));
+ result = (jvirt_barray_ptr)alloc_small(cinfo, pool_id,
+ sizeof(struct jvirt_barray_control));
result->mem_buffer = NULL; /* marks array not yet realized */
result->rows_in_array = numrows;
@@ -635,10 +629,10 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
METHODDEF(void)
-realize_virt_arrays (j_common_ptr cinfo)
+realize_virt_arrays(j_common_ptr cinfo)
/* Allocate the in-memory buffers for any unrealized virtual arrays */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
size_t space_per_minheight, maximum_space, avail_mem;
size_t minheights, max_minheights;
jvirt_sarray_ptr sptr;
@@ -652,11 +646,11 @@ realize_virt_arrays (j_common_ptr cinfo)
maximum_space = 0;
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->mem_buffer == NULL) { /* if not realized yet */
- size_t new_space = (long) sptr->rows_in_array *
- (long) sptr->samplesperrow * sizeof(JSAMPLE);
+ size_t new_space = (long)sptr->rows_in_array *
+ (long)sptr->samplesperrow * sizeof(JSAMPLE);
- space_per_minheight += (long) sptr->maxaccess *
- (long) sptr->samplesperrow * sizeof(JSAMPLE);
+ space_per_minheight += (long)sptr->maxaccess *
+ (long)sptr->samplesperrow * sizeof(JSAMPLE);
if (SIZE_MAX - maximum_space < new_space)
out_of_memory(cinfo, 10);
maximum_space += new_space;
@@ -664,11 +658,11 @@ realize_virt_arrays (j_common_ptr cinfo)
}
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->mem_buffer == NULL) { /* if not realized yet */
- size_t new_space = (long) bptr->rows_in_array *
- (long) bptr->blocksperrow * sizeof(JBLOCK);
+ size_t new_space = (long)bptr->rows_in_array *
+ (long)bptr->blocksperrow * sizeof(JBLOCK);
- space_per_minheight += (long) bptr->maxaccess *
- (long) bptr->blocksperrow * sizeof(JBLOCK);
+ space_per_minheight += (long)bptr->maxaccess *
+ (long)bptr->blocksperrow * sizeof(JBLOCK);
if (SIZE_MAX - maximum_space < new_space)
out_of_memory(cinfo, 11);
maximum_space += new_space;
@@ -701,17 +695,17 @@ realize_virt_arrays (j_common_ptr cinfo)
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->mem_buffer == NULL) { /* if not realized yet */
- minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
+ minheights = ((long)sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
if (minheights <= max_minheights) {
/* This buffer fits in memory */
sptr->rows_in_mem = sptr->rows_in_array;
} else {
/* It doesn't fit in memory, create backing store. */
- sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
- jpeg_open_backing_store(cinfo, & sptr->b_s_info,
- (long) sptr->rows_in_array *
- (long) sptr->samplesperrow *
- (long) sizeof(JSAMPLE));
+ sptr->rows_in_mem = (JDIMENSION)(max_minheights * sptr->maxaccess);
+ jpeg_open_backing_store(cinfo, &sptr->b_s_info,
+ (long)sptr->rows_in_array *
+ (long)sptr->samplesperrow *
+ (long)sizeof(JSAMPLE));
sptr->b_s_open = TRUE;
}
sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
@@ -725,17 +719,17 @@ realize_virt_arrays (j_common_ptr cinfo)
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->mem_buffer == NULL) { /* if not realized yet */
- minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
+ minheights = ((long)bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
if (minheights <= max_minheights) {
/* This buffer fits in memory */
bptr->rows_in_mem = bptr->rows_in_array;
} else {
/* It doesn't fit in memory, create backing store. */
- bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
- jpeg_open_backing_store(cinfo, & bptr->b_s_info,
- (long) bptr->rows_in_array *
- (long) bptr->blocksperrow *
- (long) sizeof(JBLOCK));
+ bptr->rows_in_mem = (JDIMENSION)(max_minheights * bptr->maxaccess);
+ jpeg_open_backing_store(cinfo, &bptr->b_s_info,
+ (long)bptr->rows_in_array *
+ (long)bptr->blocksperrow *
+ (long)sizeof(JBLOCK));
bptr->b_s_open = TRUE;
}
bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
@@ -750,32 +744,32 @@ realize_virt_arrays (j_common_ptr cinfo)
LOCAL(void)
-do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
+do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
/* Do backing store read or write of a virtual sample array */
{
long bytesperrow, file_offset, byte_count, rows, thisrow, i;
- bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE);
+ bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
file_offset = ptr->cur_start_row * bytesperrow;
/* Loop to read or write each allocation chunk in mem_buffer */
- for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
/* One chunk, but check for short chunk at end of buffer */
- rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
/* Transfer no more than is currently defined */
- thisrow = (long) ptr->cur_start_row + i;
- rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
/* Transfer no more than fits in file */
- rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
if (rows <= 0) /* this chunk might be past end of file! */
break;
byte_count = rows * bytesperrow;
if (writing)
- (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
else
- (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
file_offset += byte_count;
}
@@ -783,32 +777,32 @@ do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
LOCAL(void)
-do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
+do_barray_io(j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
/* Do backing store read or write of a virtual coefficient-block array */
{
long bytesperrow, file_offset, byte_count, rows, thisrow, i;
- bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK);
+ bytesperrow = (long)ptr->blocksperrow * sizeof(JBLOCK);
file_offset = ptr->cur_start_row * bytesperrow;
/* Loop to read or write each allocation chunk in mem_buffer */
- for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
/* One chunk, but check for short chunk at end of buffer */
- rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
/* Transfer no more than is currently defined */
- thisrow = (long) ptr->cur_start_row + i;
- rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
/* Transfer no more than fits in file */
- rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
if (rows <= 0) /* this chunk might be past end of file! */
break;
byte_count = rows * bytesperrow;
if (writing)
- (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
else
- (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
file_offset += byte_count;
}
@@ -816,9 +810,8 @@ do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
METHODDEF(JSAMPARRAY)
-access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
- JDIMENSION start_row, JDIMENSION num_rows,
- boolean writable)
+access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
/* Access the part of a virtual sample array starting at start_row */
/* and extending for num_rows rows. writable is true if */
/* caller intends to modify the accessed area. */
@@ -833,8 +826,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
/* Make the desired part of the virtual array accessible */
if (start_row < ptr->cur_start_row ||
- end_row > ptr->cur_start_row+ptr->rows_in_mem) {
- if (! ptr->b_s_open)
+ end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+ if (!ptr->b_s_open)
ERREXIT(cinfo, JERR_VIRTUAL_BUG);
/* Flush old buffer contents if necessary */
if (ptr->dirty) {
@@ -854,10 +847,10 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
/* use long arithmetic here to avoid overflow & unsigned problems */
long ltemp;
- ltemp = (long) end_row - (long) ptr->rows_in_mem;
+ ltemp = (long)end_row - (long)ptr->rows_in_mem;
if (ltemp < 0)
ltemp = 0; /* don't fall off front end of file */
- ptr->cur_start_row = (JDIMENSION) ltemp;
+ ptr->cur_start_row = (JDIMENSION)ltemp;
}
/* Read in the selected part of the array.
* During the initial write pass, we will do no actual read
@@ -880,15 +873,15 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
if (writable)
ptr->first_undef_row = end_row;
if (ptr->pre_zero) {
- size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE);
+ size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
end_row -= ptr->cur_start_row;
while (undef_row < end_row) {
- jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+ jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
undef_row++;
}
} else {
- if (! writable) /* reader looking at undefined data */
+ if (!writable) /* reader looking at undefined data */
ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
}
}
@@ -901,9 +894,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
METHODDEF(JBLOCKARRAY)
-access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
- JDIMENSION start_row, JDIMENSION num_rows,
- boolean writable)
+access_virt_barray(j_common_ptr cinfo, jvirt_barray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
/* Access the part of a virtual block array starting at start_row */
/* and extending for num_rows rows. writable is true if */
/* caller intends to modify the accessed area. */
@@ -918,8 +910,8 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
/* Make the desired part of the virtual array accessible */
if (start_row < ptr->cur_start_row ||
- end_row > ptr->cur_start_row+ptr->rows_in_mem) {
- if (! ptr->b_s_open)
+ end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+ if (!ptr->b_s_open)
ERREXIT(cinfo, JERR_VIRTUAL_BUG);
/* Flush old buffer contents if necessary */
if (ptr->dirty) {
@@ -939,10 +931,10 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
/* use long arithmetic here to avoid overflow & unsigned problems */
long ltemp;
- ltemp = (long) end_row - (long) ptr->rows_in_mem;
+ ltemp = (long)end_row - (long)ptr->rows_in_mem;
if (ltemp < 0)
ltemp = 0; /* don't fall off front end of file */
- ptr->cur_start_row = (JDIMENSION) ltemp;
+ ptr->cur_start_row = (JDIMENSION)ltemp;
}
/* Read in the selected part of the array.
* During the initial write pass, we will do no actual read
@@ -965,15 +957,15 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
if (writable)
ptr->first_undef_row = end_row;
if (ptr->pre_zero) {
- size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK);
+ size_t bytesperrow = (size_t)ptr->blocksperrow * sizeof(JBLOCK);
undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
end_row -= ptr->cur_start_row;
while (undef_row < end_row) {
- jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+ jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
undef_row++;
}
} else {
- if (! writable) /* reader looking at undefined data */
+ if (!writable) /* reader looking at undefined data */
ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
}
}
@@ -990,9 +982,9 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
*/
METHODDEF(void)
-free_pool (j_common_ptr cinfo, int pool_id)
+free_pool(j_common_ptr cinfo, int pool_id)
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr shdr_ptr;
large_pool_ptr lhdr_ptr;
size_t space_freed;
@@ -1013,14 +1005,14 @@ free_pool (j_common_ptr cinfo, int pool_id)
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->b_s_open) { /* there may be no backing store */
sptr->b_s_open = FALSE; /* prevent recursive close if error */
- (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
+ (*sptr->b_s_info.close_backing_store) (cinfo, &sptr->b_s_info);
}
}
mem->virt_sarray_list = NULL;
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->b_s_open) { /* there may be no backing store */
bptr->b_s_open = FALSE; /* prevent recursive close if error */
- (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
+ (*bptr->b_s_info.close_backing_store) (cinfo, &bptr->b_s_info);
}
}
mem->virt_barray_list = NULL;
@@ -1034,8 +1026,8 @@ free_pool (j_common_ptr cinfo, int pool_id)
large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
space_freed = lhdr_ptr->bytes_used +
lhdr_ptr->bytes_left +
- sizeof(large_pool_hdr);
- jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed);
+ sizeof(large_pool_hdr) + ALIGN_SIZE - 1;
+ jpeg_free_large(cinfo, (void *)lhdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
lhdr_ptr = next_lhdr_ptr;
}
@@ -1046,10 +1038,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
while (shdr_ptr != NULL) {
small_pool_ptr next_shdr_ptr = shdr_ptr->next;
- space_freed = shdr_ptr->bytes_used +
- shdr_ptr->bytes_left +
- sizeof(small_pool_hdr);
- jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
+ space_freed = shdr_ptr->bytes_used + shdr_ptr->bytes_left +
+ sizeof(small_pool_hdr) + ALIGN_SIZE - 1;
+ jpeg_free_small(cinfo, (void *)shdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
shdr_ptr = next_shdr_ptr;
}
@@ -1062,7 +1053,7 @@ free_pool (j_common_ptr cinfo, int pool_id)
*/
METHODDEF(void)
-self_destruct (j_common_ptr cinfo)
+self_destruct(j_common_ptr cinfo)
{
int pool;
@@ -1070,12 +1061,12 @@ self_destruct (j_common_ptr cinfo)
* Releasing pools in reverse order might help avoid fragmentation
* with some (brain-damaged) malloc libraries.
*/
- for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
free_pool(cinfo, pool);
}
/* Release the memory manager control block too. */
- jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr));
+ jpeg_free_small(cinfo, (void *)cinfo->mem, sizeof(my_memory_mgr));
cinfo->mem = NULL; /* ensures I will be called only once */
jpeg_mem_term(cinfo); /* system-dependent cleanup */
@@ -1088,7 +1079,7 @@ self_destruct (j_common_ptr cinfo)
*/
GLOBAL(void)
-jinit_memory_mgr (j_common_ptr cinfo)
+jinit_memory_mgr(j_common_ptr cinfo)
{
my_mem_ptr mem;
long max_to_use;
@@ -1104,22 +1095,22 @@ jinit_memory_mgr (j_common_ptr cinfo)
* in common if and only if X is a power of 2, ie has only one one-bit.
* Some compilers may give an "unreachable code" warning here; ignore it.
*/
- if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
+ if ((ALIGN_SIZE & (ALIGN_SIZE - 1)) != 0)
ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
/* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
* a multiple of ALIGN_SIZE.
* Again, an "unreachable code" warning may be ignored here.
* But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
*/
- test_mac = (size_t) MAX_ALLOC_CHUNK;
- if ((long) test_mac != MAX_ALLOC_CHUNK ||
+ test_mac = (size_t)MAX_ALLOC_CHUNK;
+ if ((long)test_mac != MAX_ALLOC_CHUNK ||
(MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
/* Attempt to allocate memory manager's control block */
- mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr));
+ mem = (my_mem_ptr)jpeg_get_small(cinfo, sizeof(my_memory_mgr));
if (mem == NULL) {
jpeg_mem_term(cinfo); /* system-dependent cleanup */
@@ -1145,7 +1136,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
/* Initialize working state */
mem->pub.max_memory_to_use = max_to_use;
- for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
mem->small_list[pool] = NULL;
mem->large_list[pool] = NULL;
}
@@ -1155,7 +1146,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
mem->total_space_allocated = sizeof(my_memory_mgr);
/* Declare ourselves open for business */
- cinfo->mem = & mem->pub;
+ cinfo->mem = &mem->pub;
/* Check for an environment variable JPEGMEM; if found, override the
* default max_memory setting from jpeg_mem_init. Note that the
@@ -1164,12 +1155,17 @@ jinit_memory_mgr (j_common_ptr cinfo)
* this feature.
*/
#ifndef NO_GETENV
- { char *memenv;
+ {
+ char memenv[30] = { 0 };
- if ((memenv = getenv("JPEGMEM")) != NULL) {
+ if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
char ch = 'x';
+#ifdef _MSC_VER
+ if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
+#else
if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
+#endif
if (ch == 'm' || ch == 'M')
max_to_use *= 1000L;
mem->pub.max_memory_to_use = max_to_use * 1000L;
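
The ALIGN_SIZE increase to 32 bytes relies on the over-allocate-then-align idiom used by alloc_small() and alloc_large() above, and on free_pool() now counting the extra ALIGN_SIZE - 1 bytes when it releases a pool. A standalone sketch of the idiom (not the jmemmgr code itself):

  #include <stdlib.h>

  #define ALIGN_SIZE  32   /* AVX2 needs 32-byte alignment */

  /* Sketch: request ALIGN_SIZE - 1 extra bytes, then round the data
   * pointer up to the next multiple of ALIGN_SIZE.  The pool keeps the
   * original block pointer; that is what must eventually be free()d,
   * which is why the freed size also includes ALIGN_SIZE - 1. */
  static void *aligned_in_pool(size_t header_size, size_t payload)
  {
    char *block = malloc(header_size + payload + ALIGN_SIZE - 1);
    char *data_ptr;

    if (block == NULL)
      return NULL;
    data_ptr = block + header_size;          /* skip the pool header */
    if ((size_t)data_ptr % ALIGN_SIZE)       /* adjust for alignment */
      data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
    return data_ptr;
  }
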
diff --git a/media/libjpeg/jmemnobs.c b/media/libjpeg/jmemnobs.c
index 5797198de8..cd6571ba1c 100644
--- a/media/libjpeg/jmemnobs.c
+++ b/media/libjpeg/jmemnobs.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1992-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017-2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,7 +15,6 @@
* This is very portable in the sense that it'll compile on almost anything,
* but you'd better have lots of main memory (or virtual memory) if you want
* to process big images.
- * Note that the max_memory_to_use option is ignored by this implementation.
*/
#define JPEG_INTERNALS
@@ -23,11 +22,6 @@
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
/*
* Memory allocation and freeing are controlled by the regular library
@@ -35,13 +29,13 @@ extern void free (void *ptr);
*/
GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject)
{
- return (void *) malloc(sizeofobject);
+ return (void *)malloc(sizeofobject);
}
GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_small(j_common_ptr cinfo, void *object, size_t sizeofobject)
{
free(object);
}
@@ -52,13 +46,13 @@ jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
*/
GLOBAL(void *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject)
{
- return (void *) malloc(sizeofobject);
+ return (void *)malloc(sizeofobject);
}
GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_large(j_common_ptr cinfo, void *object, size_t sizeofobject)
{
free(object);
}
@@ -66,14 +60,21 @@ jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
/*
* This routine computes the total memory space available for allocation.
- * Here we always say, "we got all you want bud!"
*/
GLOBAL(size_t)
-jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
- size_t max_bytes_needed, size_t already_allocated)
+jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+ size_t max_bytes_needed, size_t already_allocated)
{
- return max_bytes_needed;
+ if (cinfo->mem->max_memory_to_use) {
+ if ((size_t)cinfo->mem->max_memory_to_use > already_allocated)
+ return cinfo->mem->max_memory_to_use - already_allocated;
+ else
+ return 0;
+ } else {
+ /* Here we always say, "we got all you want bud!" */
+ return max_bytes_needed;
+ }
}
@@ -84,8 +85,8 @@ jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
*/
GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
+jpeg_open_backing_store(j_common_ptr cinfo, backing_store_ptr info,
+ long total_bytes_needed)
{
ERREXIT(cinfo, JERR_NO_BACKING_STORE);
}
@@ -97,13 +98,13 @@ jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
*/
GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
+jpeg_mem_init(j_common_ptr cinfo)
{
return 0; /* just set max_memory_to_use to 0 */
}
GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
+jpeg_mem_term(j_common_ptr cinfo)
{
/* no work */
}
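
Because jpeg_mem_available() now honors max_memory_to_use, the JPEGMEM limit parsed in jinit_memory_mgr() above is effective even with this no-backing-store memory manager. An illustrative way to set the same cap from application code (assumes the usual <stdio.h> plus "jpeglib.h" setup):

  /* Illustrative: cap the library's pool memory at roughly 4 MB, the
   * same effect as running with JPEGMEM=4M in the environment. */
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_decompress(&cinfo);
  cinfo.mem->max_memory_to_use = 4000000L;   /* bytes; 0 means no limit */
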
diff --git a/media/libjpeg/jmemsys.h b/media/libjpeg/jmemsys.h
index f7dfe87a83..9229550afd 100644
--- a/media/libjpeg/jmemsys.h
+++ b/media/libjpeg/jmemsys.h
@@ -31,9 +31,9 @@
* size of the object being freed, just in case it's needed.
*/
-EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
- size_t sizeofobject);
+EXTERN(void *) jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
/*
* These two functions are used to allocate and release large chunks of
@@ -43,9 +43,9 @@ EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
* large chunks.
*/
-EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
- size_t sizeofobject);
+EXTERN(void *) jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
/*
* The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
@@ -84,9 +84,9 @@ EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
* Conversely, zero may be returned to always use the minimum amount of memory.
*/
-EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
- size_t max_bytes_needed,
- size_t already_allocated);
+EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+ size_t max_bytes_needed,
+ size_t already_allocated);
/*
@@ -157,9 +157,9 @@ typedef struct backing_store_struct {
* just take an error exit.)
*/
-EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
- backing_store_ptr info,
- long total_bytes_needed);
+EXTERN(void) jpeg_open_backing_store(j_common_ptr cinfo,
+ backing_store_ptr info,
+ long total_bytes_needed);
/*
@@ -174,5 +174,5 @@ EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
* all opened backing-store objects have been closed.
*/
-EXTERN(long) jpeg_mem_init (j_common_ptr cinfo);
-EXTERN(void) jpeg_mem_term (j_common_ptr cinfo);
+EXTERN(long) jpeg_mem_init(j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term(j_common_ptr cinfo);
diff --git a/media/libjpeg/jmorecfg.h b/media/libjpeg/jmorecfg.h
index d73b1090d5..8cda8041b2 100644
--- a/media/libjpeg/jmorecfg.h
+++ b/media/libjpeg/jmorecfg.h
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -18,9 +18,9 @@
/*
* Maximum number of components (color channels) allowed in JPEG image.
- * To meet the letter of the JPEG spec, set this to 255. However, darn
- * few applications need more than 4 channels (maybe 5 for CMYK + alpha
- * mask). We recommend 10 as a reasonable compromise; use 4 if you are
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+ * However, darn few applications need more than 4 channels (maybe 5 for CMYK +
+ * alpha mask). We recommend 10 as a reasonable compromise; use 4 if you are
* really short on memory. (Each allowed component costs a hundred or so
* bytes of storage, whether actually used in an image or not.)
*/
@@ -44,24 +44,10 @@
#if BITS_IN_JSAMPLE == 8
/* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
*/
-#ifdef HAVE_UNSIGNED_CHAR
-
typedef unsigned char JSAMPLE;
-#define GETJSAMPLE(value) ((int) (value))
-
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value) ((int) (value))
-#else
-#define GETJSAMPLE(value) ((int) (value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
+#define GETJSAMPLE(value) ((int)(value))
#define MAXJSAMPLE 255
#define CENTERJSAMPLE 128
@@ -75,7 +61,7 @@ typedef char JSAMPLE;
*/
typedef short JSAMPLE;
-#define GETJSAMPLE(value) ((int) (value))
+#define GETJSAMPLE(value) ((int)(value))
#define MAXJSAMPLE 4095
#define CENTERJSAMPLE 2048
@@ -98,22 +84,9 @@ typedef short JCOEF;
* managers, this is also the data type passed to fread/fwrite.
*/
-#ifdef HAVE_UNSIGNED_CHAR
-
typedef unsigned char JOCTET;
#define GETJOCTET(value) (value)
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value) (value)
-#else
-#define GETJOCTET(value) ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
/* These typedefs are used for various table entries and so forth.
* They must be at least as wide as specified; but making them too big
@@ -199,7 +172,7 @@ typedef unsigned int JDIMENSION;
* software out there that uses it.
*/
-#define JMETHOD(type,methodname,arglist) type (*methodname) arglist
+#define JMETHOD(type, methodname, arglist) type (*methodname) arglist
/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
@@ -252,9 +225,9 @@ typedef int boolean;
/* Capability options common to encoder and decoder: */
-#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
-#define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */
+#define DCT_ISLOW_SUPPORTED /* accurate integer method */
+#define DCT_IFAST_SUPPORTED /* less accurate int method [legacy feature] */
+#define DCT_FLOAT_SUPPORTED /* floating-point method [legacy feature] */
/* Encoder capability options: */
@@ -294,10 +267,10 @@ typedef int boolean;
* with it. In reality, few people ever did this, because there were some
* severe restrictions involved (cjpeg and djpeg no longer worked properly,
* compressing/decompressing RGB JPEGs no longer worked properly, and the color
- * quantizer wouldn't work with pixel sizes other than 3.) Further, since all
- * of the O/S-supplied versions of libjpeg were built with the default values
- * of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications have
- * come to regard these values as immutable.
+ * quantizer wouldn't work with pixel sizes other than 3.) Furthermore, since
+ * all of the O/S-supplied versions of libjpeg were built with the default
+ * values of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications
+ * have come to regard these values as immutable.
*
* The libjpeg-turbo colorspace extensions provide a much cleaner way of
* compressing from/decompressing to buffers with arbitrary component orders
@@ -312,37 +285,37 @@ typedef int boolean;
#define RGB_BLUE 2 /* Offset of Blue */
#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */
-#define JPEG_NUMCS 17
+#define JPEG_NUMCS 17
-#define EXT_RGB_RED 0
-#define EXT_RGB_GREEN 1
-#define EXT_RGB_BLUE 2
-#define EXT_RGB_PIXELSIZE 3
+#define EXT_RGB_RED 0
+#define EXT_RGB_GREEN 1
+#define EXT_RGB_BLUE 2
+#define EXT_RGB_PIXELSIZE 3
-#define EXT_RGBX_RED 0
-#define EXT_RGBX_GREEN 1
-#define EXT_RGBX_BLUE 2
-#define EXT_RGBX_PIXELSIZE 4
+#define EXT_RGBX_RED 0
+#define EXT_RGBX_GREEN 1
+#define EXT_RGBX_BLUE 2
+#define EXT_RGBX_PIXELSIZE 4
-#define EXT_BGR_RED 2
-#define EXT_BGR_GREEN 1
-#define EXT_BGR_BLUE 0
-#define EXT_BGR_PIXELSIZE 3
+#define EXT_BGR_RED 2
+#define EXT_BGR_GREEN 1
+#define EXT_BGR_BLUE 0
+#define EXT_BGR_PIXELSIZE 3
-#define EXT_BGRX_RED 2
-#define EXT_BGRX_GREEN 1
-#define EXT_BGRX_BLUE 0
-#define EXT_BGRX_PIXELSIZE 4
+#define EXT_BGRX_RED 2
+#define EXT_BGRX_GREEN 1
+#define EXT_BGRX_BLUE 0
+#define EXT_BGRX_PIXELSIZE 4
-#define EXT_XBGR_RED 3
-#define EXT_XBGR_GREEN 2
-#define EXT_XBGR_BLUE 1
-#define EXT_XBGR_PIXELSIZE 4
+#define EXT_XBGR_RED 3
+#define EXT_XBGR_GREEN 2
+#define EXT_XBGR_BLUE 1
+#define EXT_XBGR_PIXELSIZE 4
-#define EXT_XRGB_RED 1
-#define EXT_XRGB_GREEN 2
-#define EXT_XRGB_BLUE 3
-#define EXT_XRGB_PIXELSIZE 4
+#define EXT_XRGB_RED 1
+#define EXT_XRGB_GREEN 2
+#define EXT_XRGB_BLUE 3
+#define EXT_XRGB_PIXELSIZE 4
static const int rgb_red[JPEG_NUMCS] = {
-1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
@@ -383,7 +356,7 @@ static const int rgb_pixelsize[JPEG_NUMCS] = {
#ifndef WITH_SIMD
#define MULTIPLIER int /* type for fastest integer multiply */
#else
-#define MULTIPLIER short /* prefer 16-bit with SIMD for parellelism */
+#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
#endif
#endif
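
For reference, the EXT_* offsets above are what make the colorspace extensions order-agnostic: application code indexes each pixel through the offset macros instead of assuming RGB order. An illustrative fragment for an EXT_BGRX buffer (the byte values are made up; a real buffer would come from jpeg_read_scanlines() with out_color_space = JCS_EXT_BGRX):

  /* Illustrative: reading one EXT_BGRX pixel via the offset macros. */
  unsigned char row[] = { 0x10, 0x20, 0x30, 0xFF };   /* B, G, R, X */
  unsigned char *pixel = row;                         /* first pixel */

  int red   = pixel[EXT_BGRX_RED];    /* offset 2 -> 0x30 */
  int green = pixel[EXT_BGRX_GREEN];  /* offset 1 -> 0x20 */
  int blue  = pixel[EXT_BGRX_BLUE];   /* offset 0 -> 0x10 */
  pixel += EXT_BGRX_PIXELSIZE;        /* advance 4 bytes to next pixel */
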
diff --git a/media/libjpeg/jpegcomp.h b/media/libjpeg/jpegcomp.h
index ade0d1edcd..c4834ac0df 100644
--- a/media/libjpeg/jpegcomp.h
+++ b/media/libjpeg/jpegcomp.h
@@ -1,7 +1,7 @@
/*
* jpegcomp.h
*
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -11,21 +11,22 @@
*/
#if JPEG_LIB_VERSION >= 70
-#define _DCT_scaled_size DCT_h_scaled_size
-#define _DCT_h_scaled_size DCT_h_scaled_size
-#define _DCT_v_scaled_size DCT_v_scaled_size
-#define _min_DCT_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
-#define _jpeg_width jpeg_width
-#define _jpeg_height jpeg_height
+#define _DCT_scaled_size DCT_h_scaled_size
+#define _DCT_h_scaled_size DCT_h_scaled_size
+#define _DCT_v_scaled_size DCT_v_scaled_size
+#define _min_DCT_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#define JERR_ARITH_NOTIMPL JERR_NOT_COMPILED
#else
-#define _DCT_scaled_size DCT_scaled_size
-#define _DCT_h_scaled_size DCT_scaled_size
-#define _DCT_v_scaled_size DCT_scaled_size
-#define _min_DCT_scaled_size min_DCT_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_scaled_size
-#define _jpeg_width image_width
-#define _jpeg_height image_height
+#define _DCT_scaled_size DCT_scaled_size
+#define _DCT_h_scaled_size DCT_scaled_size
+#define _DCT_v_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width
+#define _jpeg_height image_height
#endif
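The underscore aliases above exist so one source tree can compile against both
the v6b and v7+ struct layouts. A hypothetical fragment (editor's sketch; the
fields are the standard jpeg_component_info members):

#include "jpeglib.h"
#include "jpegcomp.h"

static int scaled_dct_size(j_decompress_ptr cinfo, int comp)
{
  jpeg_component_info *compptr = cinfo->comp_info + comp;
  /* Resolves to DCT_scaled_size (v6b) or DCT_h_scaled_size (v7+). */
  return compptr->_DCT_scaled_size;
}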
diff --git a/media/libjpeg/jpegint.h b/media/libjpeg/jpegint.h
index 9979a912d9..6af9e2a179 100644
--- a/media/libjpeg/jpegint.h
+++ b/media/libjpeg/jpegint.h
@@ -5,8 +5,9 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, 2021, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2021, Alex Richardson.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -27,33 +28,45 @@ typedef enum { /* Operating modes for buffer controllers */
} J_BUF_MODE;
/* Values of global_state field (jdapi.c has some dependencies on ordering!) */
-#define CSTATE_START 100 /* after create_compress */
-#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */
-#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */
-#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */
-#define DSTATE_START 200 /* after create_decompress */
-#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */
-#define DSTATE_READY 202 /* found SOS, ready for start_decompress */
-#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/
-#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */
-#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */
-#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */
-#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */
-#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */
-#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */
-#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */
+#define CSTATE_START 100 /* after create_compress */
+#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */
+#define DSTATE_START 200 /* after create_decompress */
+#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */
+#define DSTATE_READY 202 /* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/
+#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */
+#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */
/* JLONG must hold at least signed 32-bit values. */
typedef long JLONG;
+/* JUINTPTR must hold pointer values. */
+#ifdef __UINTPTR_TYPE__
+/*
+ * __UINTPTR_TYPE__ is GNU-specific and available in GCC 4.6+ and Clang 3.0+.
+ * Fortunately, that is sufficient to support the few architectures for which
+ * sizeof(void *) != sizeof(size_t). The only other options would require C99
+ * or Clang-specific builtins.
+ */
+typedef __UINTPTR_TYPE__ JUINTPTR;
+#else
+typedef size_t JUINTPTR;
+#endif
/*
* Left shift macro that handles a negative operand without causing any
* sanitizer warnings
*/
-#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
+#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
/* Declarations for compression modules */
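/* Editor's sketch (assumed usage, not part of the patch): JUINTPTR makes
 * pointer-bit tests portable even where sizeof(void *) != sizeof(size_t),
 * and LEFT_SHIFT avoids the sanitizer trap of shifting a negative value. */
#define IS_ALIGNED_16(p)  ((((JUINTPTR)(p)) & 15) == 0)
/* LEFT_SHIFT(-1, 4) expands to (JLONG)((unsigned long)-1 << 4), which is
 * -16 on two's-complement targets, whereas a bare (-1 << 4) is flagged
 * as undefined behavior by UBSan. */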
@@ -158,6 +171,9 @@ struct jpeg_decomp_master {
JDIMENSION first_MCU_col[MAX_COMPONENTS];
JDIMENSION last_MCU_col[MAX_COMPONENTS];
boolean jinit_upsampler_no_alloc;
+
+ /* Last iMCU row that was successfully decoded */
+ JDIMENSION last_good_iMCU_row;
};
/* Input control module */
@@ -274,9 +290,9 @@ struct jpeg_color_quantizer {
/* Miscellaneous useful macros */
#undef MAX
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
#undef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* We assume that right shift corresponds to signed division by 2 with
@@ -291,64 +307,64 @@ struct jpeg_color_quantizer {
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define SHIFT_TEMPS JLONG shift_temp;
-#define RIGHT_SHIFT(x,shft) \
- ((shift_temp = (x)) < 0 ? \
- (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
- (shift_temp >> (shft)))
+#define RIGHT_SHIFT(x, shft) \
+ ((shift_temp = (x)) < 0 ? \
+ (shift_temp >> (shft)) | ((~((JLONG)0)) << (32 - (shft))) : \
+ (shift_temp >> (shft)))
#else
#define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define RIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
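/* Worked example (editor's note, assuming a 32-bit JLONG): on a platform
 * where >> shifts zeros into a negative value, RIGHT_SHIFT(-8, 2) first
 * computes 0xFFFFFFF8 >> 2 = 0x3FFFFFFE, then ORs in ~0 << 30 = 0xC0000000
 * to restore the sign bits, giving 0xFFFFFFFE = -2 -- the same result as an
 * arithmetic shift, i.e. signed division by 4. */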
/* Compression module initialization routines */
-EXTERN(void) jinit_compress_master (j_compress_ptr cinfo);
-EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo,
- boolean transcode_only);
-EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_color_converter (j_compress_ptr cinfo);
-EXTERN(void) jinit_downsampler (j_compress_ptr cinfo);
-EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo);
-EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo);
+EXTERN(void) jinit_compress_master(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
+ boolean transcode_only);
+EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
/* Decompression module initialization routines */
-EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo);
-EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo);
-EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo);
-EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo);
-EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo);
-EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo);
-EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo);
+EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
/* Memory manager initialization */
-EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo);
+EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
/* Utility routines in jutils.c */
-EXTERN(long) jdiv_round_up (long a, long b);
-EXTERN(long) jround_up (long a, long b);
-EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
- JSAMPARRAY output_array, int dest_row,
- int num_rows, JDIMENSION num_cols);
-EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
- JDIMENSION num_blocks);
-EXTERN(void) jzero_far (void *target, size_t bytestozero);
+EXTERN(long) jdiv_round_up(long a, long b);
+EXTERN(long) jround_up(long a, long b);
+EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row,
+ int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks);
+EXTERN(void) jzero_far(void *target, size_t bytestozero);
/* Constant tables in jutils.c */
#if 0 /* This table is not actually needed in v6a */
extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
@@ -357,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
/* Arithmetic coding probability estimation tables in jaricom.c */
extern const JLONG jpeg_aritab[];
-
-/* Suppress undefined-structure complaints if necessary. */
-
-#ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */
-struct jvirt_sarray_control { long dummy; };
-struct jvirt_barray_control { long dummy; };
-#endif
-#endif /* INCOMPLETE_TYPES_BROKEN */
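The jutils.c helpers declared above have the usual rounding semantics:
jdiv_round_up(a, b) is ceil(a / b) and jround_up(a, b) rounds a up to a
multiple of b. Illustrative values (editor's addition):

long imcu_rows = jdiv_round_up(100L, 8L);  /* 13 iMCU rows for 100 lines */
long padded    = jround_up(100L, 8L);      /* 104, next multiple of 8 */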
diff --git a/media/libjpeg/jpeglib.h b/media/libjpeg/jpeglib.h
index 6c63f58222..d7664f0630 100644
--- a/media/libjpeg/jpeglib.h
+++ b/media/libjpeg/jpeglib.h
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1998, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -211,8 +211,8 @@ struct jpeg_marker_struct {
/* Known color spaces. */
-#define JCS_EXTENSIONS 1
-#define JCS_ALPHA_EXTENSIONS 1
+#define JCS_EXTENSIONS 1
+#define JCS_ALPHA_EXTENSIONS 1
typedef enum {
JCS_UNKNOWN, /* error/unspecified */
@@ -244,9 +244,9 @@ typedef enum {
/* DCT/IDCT algorithm options. */
typedef enum {
- JDCT_ISLOW, /* slow but accurate integer algorithm */
- JDCT_IFAST, /* faster, less accurate integer method */
- JDCT_FLOAT /* floating-point: accurate, fast on fast HW */
+ JDCT_ISLOW, /* accurate integer method */
+ JDCT_IFAST, /* less accurate integer method [legacy feature] */
+ JDCT_FLOAT /* floating-point method [legacy feature] */
} J_DCT_METHOD;
#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */
@@ -268,11 +268,11 @@ typedef enum {
/* Common fields between JPEG compression and decompression master structs. */
#define jpeg_common_fields \
- struct jpeg_error_mgr *err; /* Error handler module */\
- struct jpeg_memory_mgr *mem; /* Memory manager module */\
- struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
- void *client_data; /* Available for use by application */\
- boolean is_decompressor; /* So common code can tell which is which */\
+ struct jpeg_error_mgr *err; /* Error handler module */ \
+ struct jpeg_memory_mgr *mem; /* Memory manager module */ \
+ struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+ void *client_data; /* Available for use by application */ \
+ boolean is_decompressor; /* So common code can tell which is which */ \
int global_state /* For checking call sequence validity */
/* Routines that are to be used by both halves of the library are declared
@@ -822,9 +822,9 @@ struct jpeg_source_mgr {
* successful.
*/
-#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */
-#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */
-#define JPOOL_NUMPOOLS 2
+#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */
+#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS 2
typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
typedef struct jvirt_barray_control *jvirt_barray_ptr;
@@ -888,7 +888,7 @@ typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
/* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
/* Initialization of JPEG compression objects.
* jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -898,90 +898,95 @@ EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
* NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
*/
#define jpeg_create_compress(cinfo) \
- jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
- (size_t) sizeof(struct jpeg_compress_struct))
+ jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_compress_struct))
#define jpeg_create_decompress(cinfo) \
- jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
- (size_t) sizeof(struct jpeg_decompress_struct))
-EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
- size_t structsize);
-EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
- size_t structsize);
+ jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+ size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+ size_t structsize);
/* Destruction of JPEG compression objects */
-EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
/* Standard data source and destination managers: stdio streams. */
/* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
-EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
/* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
- unsigned long *outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
- const unsigned char *inbuffer,
- unsigned long insize);
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+ unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+ const unsigned char *inbuffer, unsigned long insize);
#endif
/* Default parameter setup for compression */
-EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
/* Compression parameter setup aids */
-EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo,
- J_COLOR_SPACE colorspace);
-EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo);
-EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality,
- boolean force_baseline);
-EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
- boolean force_baseline);
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+ J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+ boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+ boolean force_baseline);
#if JPEG_LIB_VERSION >= 70
-EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
- boolean force_baseline);
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+ boolean force_baseline);
#endif
-EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
- const unsigned int *basic_table,
- int scale_factor, boolean force_baseline);
-EXTERN(int) jpeg_quality_scaling (int quality);
-EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo);
-EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress);
-EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo);
-EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+ const unsigned int *basic_table,
+ int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
/* Main entry points for compression */
-EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
- boolean write_all_tables);
-EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
- JSAMPARRAY scanlines,
- JDIMENSION num_lines);
-EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+ boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
#if JPEG_LIB_VERSION >= 70
/* Precalculate JPEG dimensions for current compression parameters. */
-EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo);
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
#endif
/* Replaces jpeg_write_scanlines when writing raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION num_lines);
/* Write a special marker. See libjpeg.txt concerning safe usage. */
-EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
- const JOCTET *dataptr, unsigned int datalen);
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+ const JOCTET *dataptr, unsigned int datalen);
/* Same, but piecemeal. */
-EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
- unsigned int datalen);
-EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val);
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+ unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
/* Alternate compression function: just write an abbreviated table file */
-EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile. See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+ const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len);
+
/* Decompression startup: read start of JPEG datastream to see what's there */
-EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
/* Return value is one of: */
-#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */
-#define JPEG_HEADER_OK 1 /* Found valid image datastream */
-#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
+#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK 1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
/* If you pass require_image = TRUE (normal case), you need not check for
* a TABLES_ONLY return code; an abbreviated file will cause an error exit.
* JPEG_SUSPENDED is only possible if you use a data source module that can
@@ -989,27 +994,27 @@ EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
*/
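/* Minimal calling sketch for the return codes above (editor's addition;
 * cinfo is an initialized jpeg_decompress_struct, and JPEG_SUSPENDED can
 * only occur with a suspending data source): */
for (;;) {
  int rc = jpeg_read_header(&cinfo, TRUE);
  if (rc == JPEG_SUSPENDED)
    continue;  /* supply more input to the source manager, then retry */
  break;       /* JPEG_HEADER_OK: image parameters are now filled in */
}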
/* Main entry points for decompression */
-EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
-EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
- JSAMPARRAY scanlines,
- JDIMENSION max_lines);
-EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
- JDIMENSION num_lines);
-EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
- JDIMENSION *width);
-EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+ JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
/* Replaces jpeg_read_scanlines when reading raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION max_lines);
/* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
-EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
-EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
/* Return value is one of: */
/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */
#define JPEG_REACHED_SOS 1 /* Reached start of new scan */
@@ -1019,25 +1024,25 @@ EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
/* Precalculate output dimensions for current decompression parameters. */
#if JPEG_LIB_VERSION >= 80
-EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
#endif
-EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
/* Control saving of COM and APPn markers into marker_list. */
-EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
- unsigned int length_limit);
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+ unsigned int length_limit);
/* Install a special processing method for COM or APPn markers. */
-EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
- int marker_code,
- jpeg_marker_parser_method routine);
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+ int marker_code,
+ jpeg_marker_parser_method routine);
/* Read or write raw DCT coefficients --- useful for lossless transcoding. */
-EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays);
-EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
- j_compress_ptr dstinfo);
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+ j_compress_ptr dstinfo);
/* If you choose to abort compression or decompression before completing
* jpeg_finish_(de)compress, then you need to clean up to release memory,
@@ -1045,17 +1050,22 @@ EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
* if you're done with the JPEG object, but if you want to clean it up and
* reuse it, call this:
*/
-EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
/* Generic versions of jpeg_abort and jpeg_destroy that work on either
* flavor of JPEG object. These may be more convenient in some places.
*/
-EXTERN(void) jpeg_abort (j_common_ptr cinfo);
-EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
/* Default restart-marker-resync procedure for use by data source modules */
-EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile. See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+ JOCTET **icc_data_ptr,
+ unsigned int *icc_data_len);
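/* Sketch of the intended call sequence (editor's addition, per libjpeg.txt;
 * needs <stdlib.h> for free()). APP2 markers must be saved before the
 * header is parsed, and the profile buffer is malloc()ed for the caller: */
JOCTET *icc_data = NULL;
unsigned int icc_len = 0;
jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);  /* keep ICC APP2 chunks */
jpeg_read_header(&cinfo, TRUE);
if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
  /* ...hand icc_data/icc_len to the color-management code... */
  free(icc_data);
}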
/* These marker codes are exported since applications and data source modules
diff --git a/media/libjpeg/jquant1.c b/media/libjpeg/jquant1.c
index e7814815ef..73b83e16e5 100644
--- a/media/libjpeg/jquant1.c
+++ b/media/libjpeg/jquant1.c
@@ -73,8 +73,9 @@
#define ODITHER_SIZE 16 /* dimension of dither matrix */
/* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
-#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE) /* # cells in matrix */
-#define ODITHER_MASK (ODITHER_SIZE-1) /* mask for wrapping around counters */
+#define ODITHER_CELLS (ODITHER_SIZE * ODITHER_SIZE) /* # cells in matrix */
+#define ODITHER_MASK (ODITHER_SIZE - 1) /* mask for wrapping around
+ counters */
typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE];
typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE];
@@ -132,12 +133,12 @@ typedef JLONG FSERROR; /* may need more than 16 bits */
typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */
#endif
-typedef FSERROR *FSERRPTR; /* pointer to error array */
+typedef FSERROR *FSERRPTR; /* pointer to error array */
/* Private subobject */
-#define MAX_Q_COMPS 4 /* max components I can handle */
+#define MAX_Q_COMPS 4 /* max components I can handle */
typedef struct {
struct jpeg_color_quantizer pub; /* public fields */
@@ -153,7 +154,7 @@ typedef struct {
*/
boolean is_padded; /* is the colorindex padded for odither? */
- int Ncolors[MAX_Q_COMPS]; /* # of values alloced to each component */
+ int Ncolors[MAX_Q_COMPS]; /* # of values allocated to each component */
/* Variables for ordered dithering */
int row_index; /* cur row's vertical index in dither matrix */
@@ -183,7 +184,7 @@ typedef my_cquantizer *my_cquantize_ptr;
LOCAL(int)
-select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
+select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
/* Determine allocation of desired colors to components, */
/* and fill in Ncolors[] array to indicate choice. */
/* Return value is total number of colors (product of Ncolors[] values). */
@@ -206,12 +207,12 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
temp = iroot; /* set temp = iroot ** nc */
for (i = 1; i < nc; i++)
temp *= iroot;
- } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */
+ } while (temp <= (long)max_colors); /* repeat till iroot exceeds root */
iroot--; /* now iroot = floor(root) */
/* Must have at least 2 color values per component */
if (iroot < 2)
- ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int) temp);
+ ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int)temp);
/* Initialize to iroot color values for each component */
total_colors = 1;
@@ -231,11 +232,11 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i);
/* calculate new total_colors if Ncolors[j] is incremented */
temp = total_colors / Ncolors[j];
- temp *= Ncolors[j]+1; /* done in long arith to avoid oflo */
- if (temp > (long) max_colors)
+ temp *= Ncolors[j] + 1; /* done in long arith to avoid oflo */
+ if (temp > (long)max_colors)
break; /* won't fit, done with this pass */
Ncolors[j]++; /* OK, apply the increment */
- total_colors = (int) temp;
+ total_colors = (int)temp;
changed = TRUE;
}
} while (changed);
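/* Worked example (editor's note): with max_colors = 256 and three
 * components, iroot settles at 6 (6^3 = 216 <= 256, 7^3 = 343 > 256).
 * The increment loop visits green first (RGB_order), raising it to 7 for
 * 6*7*6 = 252 colors; bumping red next would give 294 > 256, so the final
 * allocation is R=6, G=7, B=6. */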
@@ -245,7 +246,7 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
LOCAL(int)
-output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
/* Return j'th output value, where j will range from 0 to maxj */
/* The output values must fall in 0..MAXJSAMPLE in increasing order */
{
@@ -254,17 +255,17 @@ output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
* (Forcing the upper and lower values to the limits ensures that
* dithering can't produce a color outside the selected gamut.)
*/
- return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
+ return (int)(((JLONG)j * MAXJSAMPLE + maxj / 2) / maxj);
}
LOCAL(int)
-largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
/* Return largest input value that should map to j'th output value */
/* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
{
/* Breakpoints are halfway between values returned by output_value */
- return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+ return (int)(((JLONG)(2 * j + 1) * MAXJSAMPLE + maxj) / (2 * maxj));
}
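/* Worked example (editor's note): for nci = 6 levels (maxj = 5,
 * MAXJSAMPLE = 255), output_value yields 0, 51, 102, 153, 204, 255, and
 * largest_input_value(..., 0, 5) = (1*255 + 5) / 10 = 26, so input
 * samples 0..26 quantize to the first output level. */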
@@ -273,21 +274,21 @@ largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
*/
LOCAL(void)
-create_colormap (j_decompress_ptr cinfo)
+create_colormap(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPARRAY colormap; /* Created colormap */
int total_colors; /* Number of distinct output colors */
- int i,j,k, nci, blksize, blkdist, ptr, val;
+ int i, j, k, nci, blksize, blkdist, ptr, val;
/* Select number of colors for each component */
total_colors = select_ncolors(cinfo, cquantize->Ncolors);
/* Report selected color counts */
if (cinfo->out_color_components == 3)
- TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS,
- total_colors, cquantize->Ncolors[0],
- cquantize->Ncolors[1], cquantize->Ncolors[2]);
+ TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, total_colors,
+ cquantize->Ncolors[0], cquantize->Ncolors[1],
+ cquantize->Ncolors[2]);
else
TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors);
@@ -296,8 +297,8 @@ create_colormap (j_decompress_ptr cinfo)
/* i.e. rightmost (highest-indexed) color changes most rapidly. */
colormap = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)total_colors, (JDIMENSION)cinfo->out_color_components);
/* blksize is number of adjacent repeated entries for a component */
/* blkdist is distance between groups of identical entries for a component */
@@ -309,12 +310,12 @@ create_colormap (j_decompress_ptr cinfo)
blksize = blkdist / nci;
for (j = 0; j < nci; j++) {
/* Compute j'th output value (out of nci) for component */
- val = output_value(cinfo, i, j, nci-1);
+ val = output_value(cinfo, i, j, nci - 1);
/* Fill in all colormap entries that have this value of this component */
for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
/* fill in blksize entries beginning at ptr */
for (k = 0; k < blksize; k++)
- colormap[i][ptr+k] = (JSAMPLE) val;
+ colormap[i][ptr + k] = (JSAMPLE)val;
}
}
blkdist = blksize; /* blksize of this color is blkdist of next */
@@ -333,11 +334,11 @@ create_colormap (j_decompress_ptr cinfo)
*/
LOCAL(void)
-create_colorindex (j_decompress_ptr cinfo)
+create_colorindex(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPROW indexptr;
- int i,j,k, nci, blksize, val, pad;
+ int i, j, k, nci, blksize, val, pad;
/* For ordered dither, we pad the color index tables by MAXJSAMPLE in
* each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE).
@@ -345,7 +346,7 @@ create_colorindex (j_decompress_ptr cinfo)
* flag whether it was done in case user changes dithering mode.
*/
if (cinfo->dither_mode == JDITHER_ORDERED) {
- pad = MAXJSAMPLE*2;
+ pad = MAXJSAMPLE * 2;
cquantize->is_padded = TRUE;
} else {
pad = 0;
@@ -353,9 +354,9 @@ create_colorindex (j_decompress_ptr cinfo)
}
cquantize->colorindex = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (MAXJSAMPLE+1 + pad),
- (JDIMENSION) cinfo->out_color_components);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(MAXJSAMPLE + 1 + pad),
+ (JDIMENSION)cinfo->out_color_components);
/* blksize is number of adjacent repeated entries for a component */
blksize = cquantize->sv_actual;
@@ -373,18 +374,18 @@ create_colorindex (j_decompress_ptr cinfo)
/* and k = largest j that maps to current val */
indexptr = cquantize->colorindex[i];
val = 0;
- k = largest_input_value(cinfo, i, 0, nci-1);
+ k = largest_input_value(cinfo, i, 0, nci - 1);
for (j = 0; j <= MAXJSAMPLE; j++) {
while (j > k) /* advance val if past boundary */
- k = largest_input_value(cinfo, i, ++val, nci-1);
+ k = largest_input_value(cinfo, i, ++val, nci - 1);
/* premultiply so that no multiplication needed in main processing */
- indexptr[j] = (JSAMPLE) (val * blksize);
+ indexptr[j] = (JSAMPLE)(val * blksize);
}
/* Pad at both ends if necessary */
if (pad)
for (j = 1; j <= MAXJSAMPLE; j++) {
indexptr[-j] = indexptr[0];
- indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
+ indexptr[MAXJSAMPLE + j] = indexptr[MAXJSAMPLE];
}
}
}
@@ -396,29 +397,29 @@ create_colorindex (j_decompress_ptr cinfo)
*/
LOCAL(ODITHER_MATRIX_PTR)
-make_odither_array (j_decompress_ptr cinfo, int ncolors)
+make_odither_array(j_decompress_ptr cinfo, int ncolors)
{
ODITHER_MATRIX_PTR odither;
- int j,k;
- JLONG num,den;
+ int j, k;
+ JLONG num, den;
odither = (ODITHER_MATRIX_PTR)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(ODITHER_MATRIX));
/* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
* Hence the dither value for the matrix cell with fill order f
* (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
* On 16-bit-int machine, be careful to avoid overflow.
*/
- den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
+ den = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
for (j = 0; j < ODITHER_SIZE; j++) {
for (k = 0; k < ODITHER_SIZE; k++) {
- num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
- * MAXJSAMPLE;
+ num = ((JLONG)(ODITHER_CELLS - 1 -
+ 2 * ((int)base_dither_matrix[j][k]))) * MAXJSAMPLE;
/* Ensure round towards zero despite C's lack of consistency
* about rounding negative values in integer division...
*/
- odither[j][k] = (int) (num<0 ? -((-num)/den) : num/den);
+ odither[j][k] = (int)(num < 0 ? -((-num) / den) : num / den);
}
}
return odither;
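/* Worked example (editor's note): for ncolors = 2, den = 2 * 256 * 1 = 512;
 * the first-filled cell (f = 0) gets num = 255 * 255 = 65025 and thus
 * odither = 127, while the last-filled cell gets -127. The dither spans
 * about +/-MAXJSAMPLE/2, half the inter-level distance of 255. */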
@@ -432,9 +433,9 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors)
*/
LOCAL(void)
-create_odither_tables (j_decompress_ptr cinfo)
+create_odither_tables(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
ODITHER_MATRIX_PTR odither;
int i, j, nci;
@@ -459,11 +460,11 @@ create_odither_tables (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPARRAY colorindex = cquantize->colorindex;
register int pixcode, ci;
register JSAMPROW ptrin, ptrout;
@@ -478,20 +479,20 @@ color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (col = width; col > 0; col--) {
pixcode = 0;
for (ci = 0; ci < nc; ci++) {
- pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+ pixcode += colorindex[ci][*ptrin++];
}
- *ptrout++ = (JSAMPLE) pixcode;
+ *ptrout++ = (JSAMPLE)pixcode;
}
}
}
METHODDEF(void)
-color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* Fast path for out_color_components==3, no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register int pixcode;
register JSAMPROW ptrin, ptrout;
JSAMPROW colorindex0 = cquantize->colorindex[0];
@@ -505,21 +506,21 @@ color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
ptrin = input_buf[row];
ptrout = output_buf[row];
for (col = width; col > 0; col--) {
- pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
- pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
- pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
- *ptrout++ = (JSAMPLE) pixcode;
+ pixcode = colorindex0[*ptrin++];
+ pixcode += colorindex1[*ptrin++];
+ pixcode += colorindex2[*ptrin++];
+ *ptrout++ = (JSAMPLE)pixcode;
}
}
}
METHODDEF(void)
-quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, with ordered dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register JSAMPROW input_ptr;
register JSAMPROW output_ptr;
JSAMPROW colorindex_ci;
@@ -533,7 +534,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (row = 0; row < num_rows; row++) {
/* Initialize output values to 0 so can process components separately */
- jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+ jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
row_index = cquantize->row_index;
for (ci = 0; ci < nc; ci++) {
input_ptr = input_buf[row] + ci;
@@ -550,7 +551,8 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* inputs. The maximum dither is +- MAXJSAMPLE; this sets the
* required amount of padding.
*/
- *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
+ *output_ptr +=
+ colorindex_ci[*input_ptr + dither[col_index]];
input_ptr += nc;
output_ptr++;
col_index = (col_index + 1) & ODITHER_MASK;
@@ -564,11 +566,11 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
METHODDEF(void)
-quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* Fast path for out_color_components==3, with ordered dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register int pixcode;
register JSAMPROW input_ptr;
register JSAMPROW output_ptr;
@@ -593,13 +595,10 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
col_index = 0;
for (col = width; col > 0; col--) {
- pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) +
- dither0[col_index]]);
- pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) +
- dither1[col_index]]);
- pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) +
- dither2[col_index]]);
- *output_ptr++ = (JSAMPLE) pixcode;
+ pixcode = colorindex0[(*input_ptr++) + dither0[col_index]];
+ pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+ pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
+ *output_ptr++ = (JSAMPLE)pixcode;
col_index = (col_index + 1) & ODITHER_MASK;
}
row_index = (row_index + 1) & ODITHER_MASK;
@@ -609,11 +608,11 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
METHODDEF(void)
-quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, with Floyd-Steinberg dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register LOCFSERROR cur; /* current error or pixel value */
LOCFSERROR belowerr; /* error for pixel below cur */
LOCFSERROR bpreverr; /* error for below/prev col */
@@ -637,17 +636,17 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (row = 0; row < num_rows; row++) {
/* Initialize output values to 0 so can process components separately */
- jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+ jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
for (ci = 0; ci < nc; ci++) {
input_ptr = input_buf[row] + ci;
output_ptr = output_buf[row];
if (cquantize->on_odd_row) {
/* work right to left in this row */
- input_ptr += (width-1) * nc; /* so point to rightmost pixel */
- output_ptr += width-1;
+ input_ptr += (width - 1) * nc; /* so point to rightmost pixel */
+ output_ptr += width - 1;
dir = -1;
dirnc = -nc;
- errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
+ errorptr = cquantize->fserrors[ci] + (width + 1); /* => entry after last column */
} else {
/* work left to right in this row */
dir = 1;
@@ -675,15 +674,15 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* The maximum error is +- MAXJSAMPLE; this sets the required size
* of the range_limit array.
*/
- cur += GETJSAMPLE(*input_ptr);
- cur = GETJSAMPLE(range_limit[cur]);
+ cur += *input_ptr;
+ cur = range_limit[cur];
/* Select output value, accumulate into output code for this pixel */
- pixcode = GETJSAMPLE(colorindex_ci[cur]);
- *output_ptr += (JSAMPLE) pixcode;
+ pixcode = colorindex_ci[cur];
+ *output_ptr += (JSAMPLE)pixcode;
/* Compute actual representation error at this pixel */
/* Note: we can do this even though we don't have the final */
/* pixel code, because the colormap is orthogonal. */
- cur -= GETJSAMPLE(colormap_ci[pixcode]);
+ cur -= colormap_ci[pixcode];
/* Compute error fractions to be propagated to adjacent pixels.
* Add these into the running sums, and simultaneously shift the
* next-line error sums left by 1 column.
@@ -691,7 +690,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
bnexterr = cur;
delta = cur * 2;
cur += delta; /* form error * 3 */
- errorptr[0] = (FSERROR) (bpreverr + cur);
+ errorptr[0] = (FSERROR)(bpreverr + cur);
cur += delta; /* form error * 5 */
bpreverr = belowerr + cur;
belowerr = bnexterr;
@@ -708,7 +707,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* final fserrors[] entry. Note we need not unload belowerr because
* it is for the dummy column before or after the actual array.
*/
- errorptr[0] = (FSERROR) bpreverr; /* unload prev err into array */
+ errorptr[0] = (FSERROR)bpreverr; /* unload prev err into array */
}
cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE);
}
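/* Editor's note on the arithmetic above: these are the classic
 * Floyd-Steinberg weights -- 7/16 to the next pixel, and 3/16, 5/16, 1/16
 * to the three neighbors below. The code forms err*3, err*5 and err*7 by
 * repeatedly adding delta = err*2, and the division by 16 happens (with
 * rounding) when the accumulated sums are consumed via RIGHT_SHIFT(..., 4). */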
@@ -720,16 +719,16 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
*/
LOCAL(void)
-alloc_fs_workspace (j_decompress_ptr cinfo)
+alloc_fs_workspace(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
size_t arraysize;
int i;
- arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+ arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
for (i = 0; i < cinfo->out_color_components; i++) {
cquantize->fserrors[i] = (FSERRPTR)
- (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
}
}
@@ -739,9 +738,9 @@ alloc_fs_workspace (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
size_t arraysize;
int i;
@@ -767,7 +766,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
* we must recreate the color index table with padding.
* This will cost extra space, but probably isn't very likely.
*/
- if (! cquantize->is_padded)
+ if (!cquantize->is_padded)
create_colorindex(cinfo);
/* Create ordered-dither tables if we didn't already. */
if (cquantize->odither[0] == NULL)
@@ -780,9 +779,9 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
if (cquantize->fserrors[0] == NULL)
alloc_fs_workspace(cinfo);
/* Initialize the propagated errors to zero. */
- arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+ arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
for (i = 0; i < cinfo->out_color_components; i++)
- jzero_far((void *) cquantize->fserrors[i], arraysize);
+ jzero_far((void *)cquantize->fserrors[i], arraysize);
break;
default:
ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -796,7 +795,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
*/
METHODDEF(void)
-finish_pass_1_quant (j_decompress_ptr cinfo)
+finish_pass_1_quant(j_decompress_ptr cinfo)
{
/* no work in 1-pass case */
}
@@ -808,7 +807,7 @@ finish_pass_1_quant (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-new_color_map_1_quant (j_decompress_ptr cinfo)
+new_color_map_1_quant(j_decompress_ptr cinfo)
{
ERREXIT(cinfo, JERR_MODE_CHANGE);
}
@@ -819,14 +818,14 @@ new_color_map_1_quant (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_1pass_quantizer (j_decompress_ptr cinfo)
+jinit_1pass_quantizer(j_decompress_ptr cinfo)
{
my_cquantize_ptr cquantize;
cquantize = (my_cquantize_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_cquantizer));
- cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+ cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
cquantize->pub.start_pass = start_pass_1_quant;
cquantize->pub.finish_pass = finish_pass_1_quant;
cquantize->pub.new_color_map = new_color_map_1_quant;
@@ -837,8 +836,8 @@ jinit_1pass_quantizer (j_decompress_ptr cinfo)
if (cinfo->out_color_components > MAX_Q_COMPS)
ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
/* Make sure colormap indexes can be represented by JSAMPLEs */
- if (cinfo->desired_number_of_colors > (MAXJSAMPLE+1))
- ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE+1);
+ if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
+ ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
/* Create the colormap and color index table. */
create_colormap(cinfo);
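Typical decompression parameters that route output through this one-pass
quantizer (editor's sketch; these are the standard libjpeg parameter fields,
set between jpeg_read_header() and jpeg_start_decompress()):

cinfo.quantize_colors = TRUE;
cinfo.two_pass_quantize = FALSE;       /* jquant1.c rather than jquant2.c */
cinfo.dither_mode = JDITHER_ORDERED;
cinfo.desired_number_of_colors = 216;  /* e.g. a 6x6x6 color cube */
jpeg_start_decompress(&cinfo);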
diff --git a/media/libjpeg/jquant2.c b/media/libjpeg/jquant2.c
index cfbd0f1526..44efb18cad 100644
--- a/media/libjpeg/jquant2.c
+++ b/media/libjpeg/jquant2.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2014-2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -73,14 +73,14 @@
* probably need to change these scale factors.
*/
-#define R_SCALE 2 /* scale R distances by this much */
-#define G_SCALE 3 /* scale G distances by this much */
-#define B_SCALE 1 /* and B by this much */
+#define R_SCALE 2 /* scale R distances by this much */
+#define G_SCALE 3 /* scale G distances by this much */
+#define B_SCALE 1 /* and B by this much */
-static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
-#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
-#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
-#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
+static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
/*
* First we have the histogram data structure and routines for creating it.
@@ -106,7 +106,7 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
* each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
*/
-#define MAXNUMCOLORS (MAXJSAMPLE+1) /* maximum size of colormap */
+#define MAXNUMCOLORS (MAXJSAMPLE + 1) /* maximum size of colormap */
/* These will do the right thing for either R,G,B or B,G,R color order,
* but you may not like the results for other color orders.
@@ -116,19 +116,19 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */
/* Number of elements along histogram axes. */
-#define HIST_C0_ELEMS (1<<HIST_C0_BITS)
-#define HIST_C1_ELEMS (1<<HIST_C1_BITS)
-#define HIST_C2_ELEMS (1<<HIST_C2_BITS)
+#define HIST_C0_ELEMS (1 << HIST_C0_BITS)
+#define HIST_C1_ELEMS (1 << HIST_C1_BITS)
+#define HIST_C2_ELEMS (1 << HIST_C2_BITS)
/* These are the amounts to shift an input value to get a histogram index. */
-#define C0_SHIFT (BITS_IN_JSAMPLE-HIST_C0_BITS)
-#define C1_SHIFT (BITS_IN_JSAMPLE-HIST_C1_BITS)
-#define C2_SHIFT (BITS_IN_JSAMPLE-HIST_C2_BITS)
+#define C0_SHIFT (BITS_IN_JSAMPLE - HIST_C0_BITS)
+#define C1_SHIFT (BITS_IN_JSAMPLE - HIST_C1_BITS)
+#define C2_SHIFT (BITS_IN_JSAMPLE - HIST_C2_BITS)
typedef UINT16 histcell; /* histogram cell; prefer an unsigned type */
-typedef histcell *histptr; /* for pointers to histogram cells */
+typedef histcell *histptr; /* for pointers to histogram cells */
typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
typedef hist1d *hist2d; /* type for the 2nd-level pointers */
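/* Concrete sizes implied by the definitions above (BITS_IN_JSAMPLE = 8):
 * C0_SHIFT = 3, C1_SHIFT = 2, C2_SHIFT = 3, so the histogram holds
 * 32 x 64 x 32 = 65536 histcells (128 KB of UINT16 counters), indexed as
 * histogram[r >> 3][g >> 2][b >> 3] for R,G,B component order. */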
@@ -200,10 +200,10 @@ typedef my_cquantizer *my_cquantize_ptr;
*/
METHODDEF(void)
-prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register JSAMPROW ptr;
register histptr histp;
register hist3d histogram = cquantize->histogram;
@@ -215,9 +215,9 @@ prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
ptr = input_buf[row];
for (col = width; col > 0; col--) {
/* get pixel value and index into the histogram */
- histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
- [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
- [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+ histp = &histogram[ptr[0] >> C0_SHIFT]
+ [ptr[1] >> C1_SHIFT]
+ [ptr[2] >> C2_SHIFT];
/* increment, check for overflow and undo increment if so. */
if (++(*histp) <= 0)
(*histp)--;
@@ -249,7 +249,7 @@ typedef box *boxptr;
LOCAL(boxptr)
-find_biggest_color_pop (boxptr boxlist, int numboxes)
+find_biggest_color_pop(boxptr boxlist, int numboxes)
/* Find the splittable box with the largest color population */
/* Returns NULL if no splittable boxes remain */
{
@@ -269,7 +269,7 @@ find_biggest_color_pop (boxptr boxlist, int numboxes)
LOCAL(boxptr)
-find_biggest_volume (boxptr boxlist, int numboxes)
+find_biggest_volume(boxptr boxlist, int numboxes)
/* Find the splittable box with the largest (scaled) volume */
/* Returns NULL if no splittable boxes remain */
{
@@ -289,16 +289,16 @@ find_biggest_volume (boxptr boxlist, int numboxes)
LOCAL(void)
-update_box (j_decompress_ptr cinfo, boxptr boxp)
+update_box(j_decompress_ptr cinfo, boxptr boxp)
/* Shrink the min/max bounds of a box to enclose only nonzero elements, */
/* and recompute its volume and population */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
histptr histp;
- int c0,c1,c2;
- int c0min,c0max,c1min,c1max,c2min,c2max;
- JLONG dist0,dist1,dist2;
+ int c0, c1, c2;
+ int c0min, c0max, c1min, c1max, c2min, c2max;
+ JLONG dist0, dist1, dist2;
long ccount;
c0min = boxp->c0min; c0max = boxp->c0max;
@@ -308,69 +308,69 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
if (c0max > c0min)
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c0min = c0min = c0;
goto have_c0min;
}
}
- have_c0min:
+have_c0min:
if (c0max > c0min)
for (c0 = c0max; c0 >= c0min; c0--)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c0max = c0max = c0;
goto have_c0max;
}
}
- have_c0max:
+have_c0max:
if (c1max > c1min)
for (c1 = c1min; c1 <= c1max; c1++)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c1min = c1min = c1;
goto have_c1min;
}
}
- have_c1min:
+have_c1min:
if (c1max > c1min)
for (c1 = c1max; c1 >= c1min; c1--)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c1max = c1max = c1;
goto have_c1max;
}
}
- have_c1max:
+have_c1max:
if (c2max > c2min)
for (c2 = c2min; c2 <= c2max; c2++)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1min][c2];
+ histp = &histogram[c0][c1min][c2];
for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
if (*histp != 0) {
boxp->c2min = c2min = c2;
goto have_c2min;
}
}
- have_c2min:
+have_c2min:
if (c2max > c2min)
for (c2 = c2max; c2 >= c2min; c2--)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1min][c2];
+ histp = &histogram[c0][c1min][c2];
for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
if (*histp != 0) {
boxp->c2max = c2max = c2;
goto have_c2max;
}
}
- have_c2max:
+have_c2max:
/* Update box volume.
* We use 2-norm rather than real volume here; this biases the method
@@ -383,13 +383,13 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
- boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2;
+ boxp->volume = dist0 * dist0 + dist1 * dist1 + dist2 * dist2;
/* Now scan remaining volume of box and compute population */
ccount = 0;
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++, histp++)
if (*histp != 0) {
ccount++;
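The "volume" updated in this hunk is worth a second look: as the comment above notes, it is a scaled 2-norm of the axis lengths, not a true volume, which keeps a box splittable even when one axis has collapsed to a single cell. A minimal, self-contained sketch (illustrative scale factors, not the jquant2.c C*_SCALE constants):

#include <stdio.h>

int main(void)
{
  long d0 = 8 * 2, d1 = 0 * 3, d2 = 4 * 1;   /* one axis fully shrunk */
  long norm = d0 * d0 + d1 * d1 + d2 * d2;   /* scaled 2-norm */
  long real_volume = d0 * d1 * d2;           /* true volume collapses */
  printf("2-norm = %ld, real volume = %ld\n", norm, real_volume);
  /* norm = 272 > 0, so the box is still splittable; volume = 0 */
  return 0;
}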
@@ -400,19 +400,19 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
LOCAL(int)
-median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
- int desired_colors)
+median_cut(j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
+ int desired_colors)
/* Repeatedly select and split the largest box until we have enough boxes */
{
- int n,lb;
- int c0,c1,c2,cmax;
- register boxptr b1,b2;
+ int n, lb;
+ int c0, c1, c2, cmax;
+ register boxptr b1, b2;
while (numboxes < desired_colors) {
/* Select box to split.
* Current algorithm: by population for first half, then by volume.
*/
- if (numboxes*2 <= desired_colors) {
+ if (numboxes * 2 <= desired_colors) {
b1 = find_biggest_color_pop(boxlist, numboxes);
} else {
b1 = find_biggest_volume(boxlist, numboxes);
@@ -421,8 +421,8 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
break;
b2 = &boxlist[numboxes]; /* where new box will go */
/* Copy the color bounds to the new box. */
- b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
- b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
+ b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
+ b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
/* Choose which axis to split the box on.
* Current algorithm: longest scaled axis.
* See notes in update_box about scaling distances.
@@ -434,13 +434,12 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
* This code does the right thing for R,G,B or B,G,R color orders only.
*/
if (rgb_red[cinfo->out_color_space] == 0) {
- cmax = c1; n = 1;
- if (c0 > cmax) { cmax = c0; n = 0; }
+ cmax = c1; n = 1;
+ if (c0 > cmax) { cmax = c0; n = 0; }
if (c2 > cmax) { n = 2; }
- }
- else {
- cmax = c1; n = 1;
- if (c2 > cmax) { cmax = c2; n = 2; }
+ } else {
+ cmax = c1; n = 1;
+ if (c2 > cmax) { cmax = c2; n = 2; }
if (c0 > cmax) { n = 0; }
}
/* Choose split point along selected axis, and update box bounds.
@@ -453,17 +452,17 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
case 0:
lb = (b1->c0max + b1->c0min) / 2;
b1->c0max = lb;
- b2->c0min = lb+1;
+ b2->c0min = lb + 1;
break;
case 1:
lb = (b1->c1max + b1->c1min) / 2;
b1->c1max = lb;
- b2->c1min = lb+1;
+ b2->c1min = lb + 1;
break;
case 2:
lb = (b1->c2max + b1->c2min) / 2;
b1->c2max = lb;
- b2->c2min = lb+1;
+ b2->c2min = lb + 1;
break;
}
/* Update stats for boxes */
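The switch above halves the chosen axis at its midpoint, with the lower half staying in b1 and the upper half going to b2. A tiny self-contained demo of that split arithmetic (values are illustrative):

#include <stdio.h>

int main(void)
{
  int c0min = 0, c0max = 31;          /* a box spanning 32 cells */
  int lb = (c0max + c0min) / 2;       /* midpoint, as in case 0 above */
  printf("b1 keeps [%d..%d], b2 gets [%d..%d]\n",
         c0min, lb, lb + 1, c0max);   /* -> [0..15] and [16..31] */
  return 0;
}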
@@ -476,16 +475,16 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
LOCAL(void)
-compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
+compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
/* Compute representative color for a box, put it in colormap[icolor] */
{
/* Current algorithm: mean weighted by pixels (not colors) */
/* Note it is important to get the rounding correct! */
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
histptr histp;
- int c0,c1,c2;
- int c0min,c0max,c1min,c1max,c2min,c2max;
+ int c0, c1, c2;
+ int c0min, c0max, c1min, c1max, c2min, c2max;
long count;
long total = 0;
long c0total = 0;
@@ -498,25 +497,25 @@ compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++) {
if ((count = *histp++) != 0) {
total += count;
- c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
- c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
- c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
+ c0total += ((c0 << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
+ c1total += ((c1 << C1_SHIFT) + ((1 << C1_SHIFT) >> 1)) * count;
+ c2total += ((c2 << C2_SHIFT) + ((1 << C2_SHIFT) >> 1)) * count;
}
}
}
- cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total);
- cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total);
- cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total);
+ cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
+ cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
+ cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
}
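compute_color's "important to get the rounding correct" comment refers to two half-step corrections visible above: each coarse histogram cell contributes the value at its center, and the final division rounds to nearest by adding total/2 first. A self-contained sketch with made-up values:

#include <stdio.h>

int main(void)
{
  enum { C0_SHIFT = 3 };                  /* coarse histogram axis */
  int c = 10, count = 7;
  long total = count;
  /* cell c covers [c<<SHIFT, (c+1)<<SHIFT); its center is the
   * shifted coordinate plus half a cell width */
  long c0total = ((long)(c << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
  printf("cell center = %d\n", (c << C0_SHIFT) + ((1 << C0_SHIFT) >> 1));
  printf("rounded mean = %ld\n", (c0total + (total >> 1)) / total);  /* 84 */
  return 0;
}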
LOCAL(void)
-select_colors (j_decompress_ptr cinfo, int desired_colors)
+select_colors(j_decompress_ptr cinfo, int desired_colors)
/* Master routine for color selection */
{
boxptr boxlist;
@@ -524,8 +523,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
int i;
/* Allocate workspace for box list */
- boxlist = (boxptr) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
+ boxlist = (boxptr)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
/* Initialize one box containing whole space */
numboxes = 1;
boxlist[0].c0min = 0;
@@ -535,12 +534,12 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
boxlist[0].c2min = 0;
boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT;
/* Shrink it to actually-used volume and set its statistics */
- update_box(cinfo, & boxlist[0]);
+ update_box(cinfo, &boxlist[0]);
/* Perform median-cut to produce final box list */
numboxes = median_cut(cinfo, boxlist, numboxes, desired_colors);
/* Compute the representative color for each box, fill colormap */
for (i = 0; i < numboxes; i++)
- compute_color(cinfo, & boxlist[i], i);
+ compute_color(cinfo, &boxlist[i], i);
cinfo->actual_number_of_colors = numboxes;
TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, numboxes);
}
@@ -601,13 +600,13 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
/* log2(histogram cells in update box) for each axis; this can be adjusted */
-#define BOX_C0_LOG (HIST_C0_BITS-3)
-#define BOX_C1_LOG (HIST_C1_BITS-3)
-#define BOX_C2_LOG (HIST_C2_BITS-3)
+#define BOX_C0_LOG (HIST_C0_BITS - 3)
+#define BOX_C1_LOG (HIST_C1_BITS - 3)
+#define BOX_C2_LOG (HIST_C2_BITS - 3)
-#define BOX_C0_ELEMS (1<<BOX_C0_LOG) /* # of hist cells in update box */
-#define BOX_C1_ELEMS (1<<BOX_C1_LOG)
-#define BOX_C2_ELEMS (1<<BOX_C2_LOG)
+#define BOX_C0_ELEMS (1 << BOX_C0_LOG) /* # of hist cells in update box */
+#define BOX_C1_ELEMS (1 << BOX_C1_LOG)
+#define BOX_C2_ELEMS (1 << BOX_C2_LOG)
#define BOX_C0_SHIFT (C0_SHIFT + BOX_C0_LOG)
#define BOX_C1_SHIFT (C1_SHIFT + BOX_C1_LOG)
@@ -623,8 +622,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
*/
LOCAL(int)
-find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
- JSAMPLE colorlist[])
+find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ JSAMPLE colorlist[])
/* Locate the colormap entries close enough to an update box to be candidates
* for the nearest entry to some cell(s) in the update box. The update box
* is specified by the center coordinates of its first cell. The number of
@@ -666,70 +665,70 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
for (i = 0; i < numcolors; i++) {
/* We compute the squared-c0-distance term, then add in the other two. */
- x = GETJSAMPLE(cinfo->colormap[0][i]);
+ x = cinfo->colormap[0][i];
if (x < minc0) {
tdist = (x - minc0) * C0_SCALE;
- min_dist = tdist*tdist;
+ min_dist = tdist * tdist;
tdist = (x - maxc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else if (x > maxc0) {
tdist = (x - maxc0) * C0_SCALE;
- min_dist = tdist*tdist;
+ min_dist = tdist * tdist;
tdist = (x - minc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
min_dist = 0;
if (x <= centerc0) {
tdist = (x - maxc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else {
tdist = (x - minc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
}
}
- x = GETJSAMPLE(cinfo->colormap[1][i]);
+ x = cinfo->colormap[1][i];
if (x < minc1) {
tdist = (x - minc1) * C1_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - maxc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else if (x > maxc1) {
tdist = (x - maxc1) * C1_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - minc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
if (x <= centerc1) {
tdist = (x - maxc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
tdist = (x - minc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
}
}
- x = GETJSAMPLE(cinfo->colormap[2][i]);
+ x = cinfo->colormap[2][i];
if (x < minc2) {
tdist = (x - minc2) * C2_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - maxc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else if (x > maxc2) {
tdist = (x - maxc2) * C2_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - minc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
if (x <= centerc2) {
tdist = (x - maxc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
tdist = (x - minc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
}
}
@@ -745,15 +744,15 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
ncolors = 0;
for (i = 0; i < numcolors; i++) {
if (mindist[i] <= minmaxdist)
- colorlist[ncolors++] = (JSAMPLE) i;
+ colorlist[ncolors++] = (JSAMPLE)i;
}
return ncolors;
}
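The long if/else ladder above implements a classic pruning rule: for each colormap entry, compute the minimum and maximum possible squared distance to any cell of the update box, then keep only entries whose minimum does not exceed the smallest maximum over all entries. A one-dimensional, self-contained sketch of the same rule (illustrative colors and box, not the three-axis jquant2.c version):

#include <stdio.h>

int main(void)
{
  int minc = 40, maxc = 47, center = 44;          /* the update box */
  int colors[4] = { 10, 42, 60, 90 };
  long mind[4], maxd[4], minmax = 0x7FFFFFFFL;
  for (int i = 0; i < 4; i++) {
    int x = colors[i], t;
    if (x < minc)      { t = x - minc; mind[i] = (long)t * t;
                         t = x - maxc; maxd[i] = (long)t * t; }
    else if (x > maxc) { t = x - maxc; mind[i] = (long)t * t;
                         t = x - minc; maxd[i] = (long)t * t; }
    else               { mind[i] = 0;  /* inside the box */
                         t = (x <= center) ? x - maxc : x - minc;
                         maxd[i] = (long)t * t; }
    if (maxd[i] < minmax) minmax = maxd[i];
  }
  for (int i = 0; i < 4; i++)
    printf("color %2d: %s\n", colors[i],
           mind[i] <= minmax ? "candidate" : "pruned");
  return 0;                /* only 42 survives in this example */
}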
LOCAL(void)
-find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
- int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
/* Find the closest colormap entry for each cell in the update box,
* given the list of candidate colors prepared by find_nearby_colors.
* Return the indexes of the closest entries in the bestcolor[] array.
@@ -775,7 +774,7 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
/* Initialize best-distance for each cell of the update box */
bptr = bestdist;
- for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--)
+ for (i = BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS - 1; i >= 0; i--)
*bptr++ = 0x7FFFFFFFL;
/* For each color selected by find_nearby_colors,
@@ -789,14 +788,14 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
#define STEP_C2 ((1 << C2_SHIFT) * C2_SCALE)
for (i = 0; i < numcolors; i++) {
- icolor = GETJSAMPLE(colorlist[i]);
+ icolor = colorlist[i];
/* Compute (square of) distance from minc0/c1/c2 to this color */
- inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
- dist0 = inc0*inc0;
- inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
- dist0 += inc1*inc1;
- inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
- dist0 += inc2*inc2;
+ inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
+ dist0 = inc0 * inc0;
+ inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
+ dist0 += inc1 * inc1;
+ inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
+ dist0 += inc2 * inc2;
/* Form the initial difference increments */
inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1;
@@ -805,16 +804,16 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
bptr = bestdist;
cptr = bestcolor;
xx0 = inc0;
- for (ic0 = BOX_C0_ELEMS-1; ic0 >= 0; ic0--) {
+ for (ic0 = BOX_C0_ELEMS - 1; ic0 >= 0; ic0--) {
dist1 = dist0;
xx1 = inc1;
- for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) {
+ for (ic1 = BOX_C1_ELEMS - 1; ic1 >= 0; ic1--) {
dist2 = dist1;
xx2 = inc2;
- for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
+ for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
if (dist2 < *bptr) {
*bptr = dist2;
- *cptr = (JSAMPLE) icolor;
+ *cptr = (JSAMPLE)icolor;
}
dist2 += xx2;
xx2 += 2 * STEP_C2 * STEP_C2;
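The xx0/xx1/xx2 bookkeeping above relies on the identity (d + s)^2 = d^2 + 2ds + s^2, so each inner-loop step adds a running increment instead of recomputing a product. A self-contained check of the trick:

#include <stdio.h>

int main(void)
{
  int d = -13, s = 8;                 /* initial offset, step size */
  long dist = (long)d * d;            /* running squared distance */
  long inc = (long)d * (2 * s) + (long)s * s;
  for (int i = 0; i < 4; i++) {
    long direct = (long)(d + i * s) * (d + i * s);
    printf("i=%d incremental=%ld direct=%ld\n", i, dist, direct);
    dist += inc;                      /* advance one step of s */
    inc += 2L * s * s;                /* increment itself grows */
  }
  return 0;                           /* the two columns agree */
}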
@@ -832,12 +831,12 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
LOCAL(void)
-fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
+fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
/* Fill the inverse-colormap entries in the update box that contains */
/* histogram cell c0/c1/c2. (Only that one cell MUST be filled, but */
/* we can fill as many others as we wish.) */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
int minc0, minc1, minc2; /* lower left corner of update box */
int ic0, ic1, ic2;
@@ -878,9 +877,9 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
cptr = bestcolor;
for (ic0 = 0; ic0 < BOX_C0_ELEMS; ic0++) {
for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
- cachep = & histogram[c0+ic0][c1+ic1][c2];
+ cachep = &histogram[c0 + ic0][c1 + ic1][c2];
for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
- *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
+ *cachep++ = (histcell)((*cptr++) + 1);
}
}
}
@@ -892,11 +891,11 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
*/
METHODDEF(void)
-pass2_no_dither (j_decompress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* This version performs no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
register JSAMPROW inptr, outptr;
register histptr cachep;
@@ -910,27 +909,27 @@ pass2_no_dither (j_decompress_ptr cinfo,
outptr = output_buf[row];
for (col = width; col > 0; col--) {
/* get pixel value and index into the cache */
- c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
- c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
- c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
- cachep = & histogram[c0][c1][c2];
+ c0 = (*inptr++) >> C0_SHIFT;
+ c1 = (*inptr++) >> C1_SHIFT;
+ c2 = (*inptr++) >> C2_SHIFT;
+ cachep = &histogram[c0][c1][c2];
/* If we have not seen this color before, find nearest colormap entry */
/* and update the cache */
if (*cachep == 0)
- fill_inverse_cmap(cinfo, c0,c1,c2);
+ fill_inverse_cmap(cinfo, c0, c1, c2);
/* Now emit the colormap index for this cell */
- *outptr++ = (JSAMPLE) (*cachep - 1);
+ *outptr++ = (JSAMPLE)(*cachep - 1);
}
}
}
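Note the - 1 when emitting: the inverse-map cache stores the colormap index plus one, so a cell value of zero doubles as "not yet computed" and no separate validity bitmap is needed (fill_inverse_cmap's + 1 store above establishes the convention). A minimal sketch:

#include <stdio.h>

int main(void)
{
  unsigned short cache[4] = { 0 };      /* all cells start empty */
  int c = 2, best_index = 5;
  if (cache[c] == 0)                    /* fill on first use */
    cache[c] = (unsigned short)(best_index + 1);
  printf("emitted index = %d\n", cache[c] - 1);    /* -> 5 */
  return 0;
}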
METHODDEF(void)
-pass2_fs_dither (j_decompress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* This version performs Floyd-Steinberg dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
@@ -956,11 +955,11 @@ pass2_fs_dither (j_decompress_ptr cinfo,
outptr = output_buf[row];
if (cquantize->on_odd_row) {
/* work right to left in this row */
- inptr += (width-1) * 3; /* so point to rightmost pixel */
- outptr += width-1;
+ inptr += (width - 1) * 3; /* so point to rightmost pixel */
+ outptr += width - 1;
dir = -1;
dir3 = -3;
- errorptr = cquantize->fserrors + (width+1)*3; /* => entry after last column */
+ errorptr = cquantize->fserrors + (width + 1) * 3; /* => entry after last column */
cquantize->on_odd_row = FALSE; /* flip for next time */
} else {
/* work left to right in this row */
@@ -984,9 +983,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* for either sign of the error value.
* Note: errorptr points to *previous* column's array entry.
*/
- cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3+0] + 8, 4);
- cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3+1] + 8, 4);
- cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3+2] + 8, 4);
+ cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3 + 0] + 8, 4);
+ cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3 + 1] + 8, 4);
+ cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3 + 2] + 8, 4);
/* Limit the error using transfer function set by init_error_limit.
* See comments with init_error_limit for rationale.
*/
@@ -997,44 +996,48 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* The maximum error is +- MAXJSAMPLE (or less with error limiting);
* this sets the required size of the range_limit array.
*/
- cur0 += GETJSAMPLE(inptr[0]);
- cur1 += GETJSAMPLE(inptr[1]);
- cur2 += GETJSAMPLE(inptr[2]);
- cur0 = GETJSAMPLE(range_limit[cur0]);
- cur1 = GETJSAMPLE(range_limit[cur1]);
- cur2 = GETJSAMPLE(range_limit[cur2]);
+ cur0 += inptr[0];
+ cur1 += inptr[1];
+ cur2 += inptr[2];
+ cur0 = range_limit[cur0];
+ cur1 = range_limit[cur1];
+ cur2 = range_limit[cur2];
/* Index into the cache with adjusted pixel value */
- cachep = & histogram[cur0>>C0_SHIFT][cur1>>C1_SHIFT][cur2>>C2_SHIFT];
+ cachep =
+ &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
/* If we have not seen this color before, find nearest colormap */
/* entry and update the cache */
if (*cachep == 0)
- fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
+ fill_inverse_cmap(cinfo, cur0 >> C0_SHIFT, cur1 >> C1_SHIFT,
+ cur2 >> C2_SHIFT);
/* Now emit the colormap index for this cell */
- { register int pixcode = *cachep - 1;
- *outptr = (JSAMPLE) pixcode;
+ {
+ register int pixcode = *cachep - 1;
+ *outptr = (JSAMPLE)pixcode;
/* Compute representation error for this pixel */
- cur0 -= GETJSAMPLE(colormap0[pixcode]);
- cur1 -= GETJSAMPLE(colormap1[pixcode]);
- cur2 -= GETJSAMPLE(colormap2[pixcode]);
+ cur0 -= colormap0[pixcode];
+ cur1 -= colormap1[pixcode];
+ cur2 -= colormap2[pixcode];
}
/* Compute error fractions to be propagated to adjacent pixels.
* Add these into the running sums, and simultaneously shift the
* next-line error sums left by 1 column.
*/
- { register LOCFSERROR bnexterr;
+ {
+ register LOCFSERROR bnexterr;
bnexterr = cur0; /* Process component 0 */
- errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3);
+ errorptr[0] = (FSERROR)(bpreverr0 + cur0 * 3);
bpreverr0 = belowerr0 + cur0 * 5;
belowerr0 = bnexterr;
cur0 *= 7;
bnexterr = cur1; /* Process component 1 */
- errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3);
+ errorptr[1] = (FSERROR)(bpreverr1 + cur1 * 3);
bpreverr1 = belowerr1 + cur1 * 5;
belowerr1 = bnexterr;
cur1 *= 7;
bnexterr = cur2; /* Process component 2 */
- errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3);
+ errorptr[2] = (FSERROR)(bpreverr2 + cur2 * 3);
bpreverr2 = belowerr2 + cur2 * 5;
belowerr2 = bnexterr;
cur2 *= 7;
@@ -1051,9 +1054,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* final fserrors[] entry. Note we need not unload belowerrN because
* it is for the dummy column before or after the actual array.
*/
- errorptr[0] = (FSERROR) bpreverr0; /* unload prev errs into array */
- errorptr[1] = (FSERROR) bpreverr1;
- errorptr[2] = (FSERROR) bpreverr2;
+ errorptr[0] = (FSERROR)bpreverr0; /* unload prev errs into array */
+ errorptr[1] = (FSERROR)bpreverr1;
+ errorptr[2] = (FSERROR)bpreverr2;
}
}
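The * 3 / * 5 / * 7 pattern above is the Floyd-Steinberg kernel with all error sums kept scaled by 16; the RIGHT_SHIFT(x + 8, 4) earlier in the function is the matching rounded divide. A self-contained sketch of the split:

#include <stdio.h>

int main(void)
{
  int err = 100;                         /* representation error */
  int next = err * 7;                    /* 7/16 to the next pixel */
  int bl = err * 3, b = err * 5, br = err * 1;  /* row below */
  printf("split: %d + %d + %d + %d = %d (16 * err)\n",
         next, bl, b, br, next + bl + b + br);
  printf("rounded /16 of next-pixel share: %d\n", (next + 8) >> 4);
  return 0;
}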
@@ -1076,31 +1079,31 @@ pass2_fs_dither (j_decompress_ptr cinfo,
*/
LOCAL(void)
-init_error_limit (j_decompress_ptr cinfo)
+init_error_limit(j_decompress_ptr cinfo)
/* Allocate and fill in the error_limiter table */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
int *table;
int in, out;
- table = (int *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int));
+ table = (int *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
table += MAXJSAMPLE; /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
cquantize->error_limiter = table;
-#define STEPSIZE ((MAXJSAMPLE+1)/16)
+#define STEPSIZE ((MAXJSAMPLE + 1) / 16)
/* Map errors 1:1 up to +- MAXJSAMPLE/16 */
out = 0;
for (in = 0; in < STEPSIZE; in++, out++) {
- table[in] = out; table[-in] = -out;
+ table[in] = out; table[-in] = -out;
}
/* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */
- for (; in < STEPSIZE*3; in++, out += (in&1) ? 0 : 1) {
- table[in] = out; table[-in] = -out;
+ for (; in < STEPSIZE * 3; in++, out += (in & 1) ? 0 : 1) {
+ table[in] = out; table[-in] = -out;
}
/* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */
for (; in <= MAXJSAMPLE; in++) {
- table[in] = out; table[-in] = -out;
+ table[in] = out; table[-in] = -out;
}
#undef STEPSIZE
}
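Traced out for MAXJSAMPLE = 255 (so STEPSIZE = 16), the table built above is a simple piecewise transfer function: identity up to 15, half slope up to 47, then clamped at (MAXJSAMPLE+1)/8 = 32. A self-contained equivalent, closed-form rather than table-driven but with the same outputs:

#include <stdio.h>

int limit(int in)
{
  int mag = in < 0 ? -in : in, out;
  if (mag < 16)       out = mag;               /* errors map 1:1 */
  else if (mag < 48)  out = 16 + (mag - 16) / 2;  /* 1:2 region */
  else                out = 32;                /* clamp */
  return in < 0 ? -out : out;
}

int main(void)
{
  int probes[] = { 5, -20, 47, 200 };
  for (int i = 0; i < 4; i++)
    printf("limit(%4d) = %d\n", probes[i], limit(probes[i]));
  return 0;                     /* -> 5, -18, 31, 32 */
}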
@@ -1111,9 +1114,9 @@ init_error_limit (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass1 (j_decompress_ptr cinfo)
+finish_pass1(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
/* Select the representative colors and fill in cinfo->colormap */
cinfo->colormap = cquantize->sv_colormap;
@@ -1124,7 +1127,7 @@ finish_pass1 (j_decompress_ptr cinfo)
METHODDEF(void)
-finish_pass2 (j_decompress_ptr cinfo)
+finish_pass2(j_decompress_ptr cinfo)
{
/* no work */
}
@@ -1135,14 +1138,14 @@ finish_pass2 (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
int i;
/* Only F-S dithering or no dithering is supported. */
- /* If user asks for ordered dither, give him F-S. */
+ /* If user asks for ordered dither, give them F-S. */
if (cinfo->dither_mode != JDITHER_NONE)
cinfo->dither_mode = JDITHER_FS;
@@ -1167,14 +1170,14 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
if (cinfo->dither_mode == JDITHER_FS) {
- size_t arraysize = (size_t) ((cinfo->output_width + 2) *
- (3 * sizeof(FSERROR)));
+ size_t arraysize =
+ (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR)));
/* Allocate Floyd-Steinberg workspace if we didn't already. */
if (cquantize->fserrors == NULL)
- cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+ cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
/* Initialize the propagated errors to zero. */
- jzero_far((void *) cquantize->fserrors, arraysize);
+ jzero_far((void *)cquantize->fserrors, arraysize);
/* Make the error-limit table if we didn't already. */
if (cquantize->error_limiter == NULL)
init_error_limit(cinfo);
@@ -1185,8 +1188,8 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
/* Zero the histogram or inverse color map, if necessary */
if (cquantize->needs_zeroed) {
for (i = 0; i < HIST_C0_ELEMS; i++) {
- jzero_far((void *) histogram[i],
- HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+ jzero_far((void *)histogram[i],
+ HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
}
cquantize->needs_zeroed = FALSE;
}
@@ -1198,9 +1201,9 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
*/
METHODDEF(void)
-new_color_map_2_quant (j_decompress_ptr cinfo)
+new_color_map_2_quant(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
/* Reset the inverse color map */
cquantize->needs_zeroed = TRUE;
@@ -1212,15 +1215,15 @@ new_color_map_2_quant (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_2pass_quantizer (j_decompress_ptr cinfo)
+jinit_2pass_quantizer(j_decompress_ptr cinfo)
{
my_cquantize_ptr cquantize;
int i;
cquantize = (my_cquantize_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_cquantizer));
- cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+ cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
cquantize->pub.start_pass = start_pass_2_quant;
cquantize->pub.new_color_map = new_color_map_2_quant;
cquantize->fserrors = NULL; /* flag optional arrays not allocated */
@@ -1231,12 +1234,12 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
ERREXIT(cinfo, JERR_NOTIMPL);
/* Allocate the histogram/inverse colormap storage */
- cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
+ cquantize->histogram = (hist3d)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
for (i = 0; i < HIST_C0_ELEMS; i++) {
- cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+ cquantize->histogram[i] = (hist2d)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
}
cquantize->needs_zeroed = TRUE; /* histogram is garbage now */
@@ -1254,13 +1257,13 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
if (desired > MAXNUMCOLORS)
ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)desired, (JDIMENSION)3);
cquantize->desired = desired;
} else
cquantize->sv_colormap = NULL;
/* Only F-S dithering or no dithering is supported. */
- /* If user asks for ordered dither, give him F-S. */
+ /* If user asks for ordered dither, give them F-S. */
if (cinfo->dither_mode != JDITHER_NONE)
cinfo->dither_mode = JDITHER_FS;
@@ -1271,9 +1274,9 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
* dither_mode changes.
*/
if (cinfo->dither_mode == JDITHER_FS) {
- cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
+ cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
/* Might as well create the error-limiting table too. */
init_error_limit(cinfo);
}
diff --git a/media/libjpeg/jsimd.h b/media/libjpeg/jsimd.h
index 3aa0779b8a..6c203655ef 100644
--- a/media/libjpeg/jsimd.h
+++ b/media/libjpeg/jsimd.h
@@ -3,7 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,81 +14,110 @@
#include "jchuff.h" /* Declarations shared with jcphuff.c */
-EXTERN(int) jsimd_can_rgb_ycc (void);
-EXTERN(int) jsimd_can_rgb_gray (void);
-EXTERN(int) jsimd_can_ycc_rgb (void);
-EXTERN(int) jsimd_can_ycc_rgb565 (void);
-EXTERN(int) jsimd_c_can_null_convert (void);
-
-EXTERN(void) jsimd_rgb_ycc_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_rgb_gray_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_ycc_rgb_convert
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_rgb565_convert
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_c_null_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(int) jsimd_can_h2v2_downsample (void);
-EXTERN(int) jsimd_can_h2v1_downsample (void);
-
-EXTERN(void) jsimd_h2v2_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
-
-EXTERN(void) jsimd_h2v2_smooth_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(void) jsimd_h2v1_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(int) jsimd_can_h2v2_upsample (void);
-EXTERN(int) jsimd_can_h2v1_upsample (void);
-EXTERN(int) jsimd_can_int_upsample (void);
-
-EXTERN(void) jsimd_h2v2_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_int_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-
-EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
-EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
-
-EXTERN(void) jsimd_h2v2_fancy_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-
-EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
-EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
-
-EXTERN(void) jsimd_h2v2_merged_upsample
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-EXTERN(void) jsimd_h2v1_merged_upsample
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
-EXTERN(int) jsimd_can_huff_encode_one_block (void);
-
-EXTERN(JOCTET*) jsimd_huff_encode_one_block
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_can_rgb_ycc(void);
+EXTERN(int) jsimd_can_rgb_gray(void);
+EXTERN(int) jsimd_can_ycc_rgb(void);
+EXTERN(int) jsimd_can_ycc_rgb565(void);
+EXTERN(int) jsimd_c_can_null_convert(void);
+
+EXTERN(void) jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows);
+
+EXTERN(int) jsimd_can_h2v2_downsample(void);
+EXTERN(int) jsimd_can_h2v1_downsample(void);
+
+EXTERN(void) jsimd_h2v2_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_smooth_downsample(void);
+
+EXTERN(void) jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_upsample(void);
+EXTERN(int) jsimd_can_h2v1_upsample(void);
+EXTERN(int) jsimd_can_int_upsample(void);
+
+EXTERN(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
+
+EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
+
+EXTERN(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(int) jsimd_can_huff_encode_one_block(void);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
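Every routine declared here comes paired with a jsimd_can_*() probe; callers test the probe once and install either the SIMD routine or the portable C one as a method pointer. A hypothetical, self-contained sketch of that pattern (names are illustrative, not the real jsimd API):

#include <stdio.h>

static int  can_fast_path(void) { return 0; }   /* like jsimd_can_* */
static void fast_path(void)  { puts("SIMD path"); }
static void plain_path(void) { puts("C path"); }

int main(void)
{
  /* probe once, then dispatch through the chosen pointer forever */
  void (*method)(void) = can_fast_path() ? fast_path : plain_path;
  method();                                     /* -> "C path" */
  return 0;
}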
diff --git a/media/libjpeg/jsimd_none.c b/media/libjpeg/jsimd_none.c
index f29030cfa7..5b38a9fb5c 100644
--- a/media/libjpeg/jsimd_none.c
+++ b/media/libjpeg/jsimd_none.c
@@ -3,7 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,385 +21,411 @@
#include "jsimddct.h"
GLOBAL(int)
-jsimd_can_rgb_ycc (void)
+jsimd_can_rgb_ycc(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_rgb_gray (void)
+jsimd_can_rgb_gray(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_ycc_rgb (void)
+jsimd_can_ycc_rgb(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
+jsimd_can_ycc_rgb565(void)
{
return 0;
}
GLOBAL(int)
-jsimd_c_can_null_convert (void)
+jsimd_c_can_null_convert(void)
{
return 0;
}
GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
+jsimd_can_h2v2_downsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
+jsimd_can_h2v1_downsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
+jsimd_can_h2v2_smooth_downsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
+jsimd_can_h2v2_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
+jsimd_can_h2v1_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_int_upsample (void)
+jsimd_can_int_upsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
+jsimd_can_h2v2_fancy_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
+jsimd_can_h2v1_fancy_upsample(void)
{
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
+jsimd_can_h2v2_merged_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
+jsimd_can_h2v1_merged_upsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
}
GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
}
GLOBAL(int)
-jsimd_can_convsamp (void)
+jsimd_can_convsamp(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_convsamp_float (void)
+jsimd_can_convsamp_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
{
}
GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
{
}
GLOBAL(int)
-jsimd_can_fdct_islow (void)
+jsimd_can_fdct_islow(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_fdct_ifast (void)
+jsimd_can_fdct_ifast(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_fdct_float (void)
+jsimd_can_fdct_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
+jsimd_fdct_islow(DCTELEM *data)
{
}
GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
+jsimd_fdct_ifast(DCTELEM *data)
{
}
GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
+jsimd_fdct_float(FAST_FLOAT *data)
{
}
GLOBAL(int)
-jsimd_can_quantize (void)
+jsimd_can_quantize(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_quantize_float (void)
+jsimd_can_quantize_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
}
GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
{
}
GLOBAL(int)
-jsimd_can_idct_2x2 (void)
+jsimd_can_idct_2x2(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_4x4 (void)
+jsimd_can_idct_4x4(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_6x6 (void)
+jsimd_can_idct_6x6(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_12x12 (void)
+jsimd_can_idct_12x12(void)
{
return 0;
}
GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(int)
-jsimd_can_idct_islow (void)
+jsimd_can_idct_islow(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_ifast (void)
+jsimd_can_idct_ifast(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_float (void)
+jsimd_can_idct_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
+jsimd_can_huff_encode_one_block(void)
{
return 0;
}
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
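The new encode_mcu_AC_*_prepare stubs mirror entry points whose real implementations pack coefficient significance into a bitmap (the zerobits/bits outputs) so the progressive Huffman encoder can skip zero runs with bit scans instead of per-coefficient tests. A hypothetical sketch of the bitmap idea, not the jcphuff.c code:

#include <stdio.h>

int main(void)
{
  short block[8] = { 3, 0, 0, -1, 0, 0, 0, 2 };
  size_t zerobits = 0;
  for (int k = 0; k < 8; k++)
    if (block[k] != 0)
      zerobits |= (size_t)1 << k;    /* bit k set => nonzero coef */
  printf("nonzero bitmap = 0x%zx\n", zerobits);   /* -> 0x89 */
  return 0;
}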
diff --git a/media/libjpeg/jsimddct.h b/media/libjpeg/jsimddct.h
index b19ab48d40..55ee8cf67f 100644
--- a/media/libjpeg/jsimddct.h
+++ b/media/libjpeg/jsimddct.h
@@ -9,66 +9,62 @@
*
*/
-EXTERN(int) jsimd_can_convsamp (void);
-EXTERN(int) jsimd_can_convsamp_float (void);
+EXTERN(int) jsimd_can_convsamp(void);
+EXTERN(int) jsimd_can_convsamp_float(void);
-EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
- JDIMENSION start_col,
- FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace);
-EXTERN(int) jsimd_can_fdct_islow (void);
-EXTERN(int) jsimd_can_fdct_ifast (void);
-EXTERN(int) jsimd_can_fdct_float (void);
+EXTERN(int) jsimd_can_fdct_islow(void);
+EXTERN(int) jsimd_can_fdct_ifast(void);
+EXTERN(int) jsimd_can_fdct_float(void);
-EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
-EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_islow(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float(FAST_FLOAT *data);
-EXTERN(int) jsimd_can_quantize (void);
-EXTERN(int) jsimd_can_quantize_float (void);
+EXTERN(int) jsimd_can_quantize(void);
+EXTERN(int) jsimd_can_quantize_float(void);
-EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace);
-EXTERN(int) jsimd_can_idct_2x2 (void);
-EXTERN(int) jsimd_can_idct_4x4 (void);
-EXTERN(int) jsimd_can_idct_6x6 (void);
-EXTERN(int) jsimd_can_idct_12x12 (void);
+EXTERN(int) jsimd_can_idct_2x2(void);
+EXTERN(int) jsimd_can_idct_4x4(void);
+EXTERN(int) jsimd_can_idct_6x6(void);
+EXTERN(int) jsimd_can_idct_12x12(void);
-EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+EXTERN(void) jsimd_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
-EXTERN(int) jsimd_can_idct_islow (void);
-EXTERN(int) jsimd_can_idct_ifast (void);
-EXTERN(int) jsimd_can_idct_float (void);
+EXTERN(int) jsimd_can_idct_islow(void);
+EXTERN(int) jsimd_can_idct_ifast(void);
+EXTERN(int) jsimd_can_idct_float(void);
-EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+EXTERN(void) jsimd_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
diff --git a/media/libjpeg/jstdhuff.c b/media/libjpeg/jstdhuff.c
index e202e8e7ec..345b513d4d 100644
--- a/media/libjpeg/jstdhuff.c
+++ b/media/libjpeg/jstdhuff.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
+ * Copyright (C) 2013, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,8 +17,8 @@
*/
LOCAL(void)
-add_huff_table (j_common_ptr cinfo,
- JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
+ const UINT8 *val)
/* Define a Huffman table */
{
int nsymbols, len;
@@ -29,7 +29,7 @@ add_huff_table (j_common_ptr cinfo,
return;
/* Copy the number-of-symbols-of-each-code-length counts */
- MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
/* Validate the counts. We do this here mainly so we can copy the right
* number of symbols from the val[] array, without risking marching off
@@ -41,8 +41,9 @@ add_huff_table (j_common_ptr cinfo,
if (nsymbols < 1 || nsymbols > 256)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
- MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
- MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
+ memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+ memset(&((*htblptr)->huffval[nsymbols]), 0,
+ (256 - nsymbols) * sizeof(UINT8));
/* Initialize sent_table FALSE so table will be written to JPEG file. */
(*htblptr)->sent_table = FALSE;
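The validation this hunk's memcpy/memset feed into sums bits[1..16] (the number of codes of each code length) and requires the total to be 1..256. A self-contained check against the standard DC luminance table that appears in the next hunk:

#include <stdio.h>

int main(void)
{
  static const unsigned char bits_dc_luminance[17] = {
    /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
  };
  int nsymbols = 0;
  for (int len = 1; len <= 16; len++)
    nsymbols += bits_dc_luminance[len];
  printf("nsymbols = %d\n", nsymbols);  /* -> 12, matches val[] length */
  return 0;
}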
@@ -50,71 +51,79 @@ add_huff_table (j_common_ptr cinfo,
LOCAL(void)
-std_huff_tables (j_common_ptr cinfo)
+std_huff_tables(j_common_ptr cinfo)
/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
/* IMPORTANT: these are only valid for 8-bit data precision! */
{
JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
- static const UINT8 bits_dc_luminance[17] =
- { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
- static const UINT8 val_dc_luminance[] =
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+ static const UINT8 bits_dc_luminance[17] = {
+ /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_luminance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
- static const UINT8 bits_dc_chrominance[17] =
- { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
- static const UINT8 val_dc_chrominance[] =
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+ static const UINT8 bits_dc_chrominance[17] = {
+ /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_chrominance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
- static const UINT8 bits_ac_luminance[17] =
- { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
- static const UINT8 val_ac_luminance[] =
- { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
- 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
- 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
- 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
- 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
- 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
- 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
- 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
- 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
- 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
- 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
- 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
- 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
- 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
- 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
- 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
- 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
- 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
- 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
- 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
- 0xf9, 0xfa };
+ static const UINT8 bits_ac_luminance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+ };
+ static const UINT8 val_ac_luminance[] = {
+ 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
- static const UINT8 bits_ac_chrominance[17] =
- { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
- static const UINT8 val_ac_chrominance[] =
- { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
- 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
- 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
- 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
- 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
- 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
- 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
- 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
- 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
- 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
- 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
- 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
- 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
- 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
- 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
- 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
- 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
- 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
- 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
- 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
- 0xf9, 0xfa };
+ static const UINT8 bits_ac_chrominance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+ };
+ static const UINT8 val_ac_chrominance[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
if (cinfo->is_decompressor) {
dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
diff --git a/media/libjpeg/jutils.c b/media/libjpeg/jutils.c
index f9d35023e5..d86271624a 100644
--- a/media/libjpeg/jutils.c
+++ b/media/libjpeg/jutils.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code
- * relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -53,7 +53,7 @@ const int jpeg_zigzag_order[DCTSIZE2] = {
* fake entries.
*/
-const int jpeg_natural_order[DCTSIZE2+16] = {
+const int jpeg_natural_order[DCTSIZE2 + 16] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
@@ -72,7 +72,7 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
*/
GLOBAL(long)
-jdiv_round_up (long a, long b)
+jdiv_round_up(long a, long b)
/* Compute a/b rounded up to next integer, i.e., ceil(a/b) */
/* Assumes a >= 0, b > 0 */
{
@@ -81,7 +81,7 @@ jdiv_round_up (long a, long b)
GLOBAL(long)
-jround_up (long a, long b)
+jround_up(long a, long b)
/* Compute a rounded up to next multiple of b, i.e., ceil(a/b)*b */
/* Assumes a >= 0, b > 0 */
{
@@ -91,9 +91,9 @@ jround_up (long a, long b)
GLOBAL(void)
-jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
- JSAMPARRAY output_array, int dest_row,
- int num_rows, JDIMENSION num_cols)
+jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row, int num_rows,
+ JDIMENSION num_cols)
/* Copy some rows of samples from one place to another.
* num_rows rows are copied from input_array[source_row++]
* to output_array[dest_row++]; these areas may overlap for duplication.
@@ -101,7 +101,7 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
*/
{
register JSAMPROW inptr, outptr;
- register size_t count = (size_t) (num_cols * sizeof(JSAMPLE));
+ register size_t count = (size_t)(num_cols * sizeof(JSAMPLE));
register int row;
input_array += source_row;
@@ -110,24 +110,24 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
for (row = num_rows; row > 0; row--) {
inptr = *input_array++;
outptr = *output_array++;
- MEMCOPY(outptr, inptr, count);
+ memcpy(outptr, inptr, count);
}
}
GLOBAL(void)
-jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
- JDIMENSION num_blocks)
+jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks)
/* Copy a row of coefficient blocks from one place to another. */
{
- MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
+ memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
}
GLOBAL(void)
-jzero_far (void *target, size_t bytestozero)
+jzero_far(void *target, size_t bytestozero)
/* Zero out a chunk of memory. */
/* This might be sample-array data, block-array data, or alloc_large data. */
{
- MEMZERO(target, bytestozero);
+ memset(target, 0, bytestozero);
}
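
The hunks above only rename jdiv_round_up() and jround_up(); their bodies are elided by the diff context. As a minimal standalone sketch of the arithmetic those helpers implement (hypothetical names, assuming the classic IJG idiom and the stated preconditions a >= 0, b > 0):

#include <assert.h>

/* Sketch, not part of the diff: ceiling division without floating point. */
static long div_round_up(long a, long b)
{
  return (a + b - 1L) / b;            /* ceil(a/b), assuming a >= 0, b > 0 */
}

/* Sketch: smallest multiple of b that is >= a. */
static long round_up(long a, long b)
{
  a += b - 1L;
  return a - (a % b);
}

int main(void)
{
  assert(div_round_up(17, 8) == 3);   /* 17/8 rounds up to 3 */
  assert(round_up(17, 8) == 24);      /* next multiple of 8 after 17 */
  return 0;
}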
diff --git a/media/libjpeg/jversion.h b/media/libjpeg/jversion.h
index 6ce663d82a..63db95b99b 100644
--- a/media/libjpeg/jversion.h
+++ b/media/libjpeg/jversion.h
@@ -2,9 +2,9 @@
* jversion.h
*
* This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2016, D. R. Commander.
+ * Copyright (C) 2010, 2012-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -30,20 +30,25 @@
* NOTE: It is our convention to place the authors in the following order:
* - libjpeg-turbo authors (2009-) in descending order of the date of their
* most recent contribution to the project, then in ascending order of the
- * date of their first contribution to the project
+ * date of their first contribution to the project, then in alphabetical
+ * order
* - Upstream authors in descending order of the date of the first inclusion of
* their code
*/
-#define JCOPYRIGHT "Copyright (C) 2009-2016 D. R. Commander\n" \
- "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
- "Copyright (C) 2015-2016 Matthieu Darbois\n" \
- "Copyright (C) 2015 Google, Inc.\n" \
- "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
- "Copyright (C) 2013 Linaro Limited\n" \
- "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
- "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
- "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
- "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \
-
-#define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others"
+#define JCOPYRIGHT \
+ "Copyright (C) 2009-2022 D. R. Commander\n" \
+ "Copyright (C) 2015, 2020 Google, Inc.\n" \
+ "Copyright (C) 2019-2020 Arm Limited\n" \
+ "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
+ "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+ "Copyright (C) 2015 Intel Corporation\n" \
+ "Copyright (C) 2013-2014 Linaro Limited\n" \
+ "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+ "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
+ "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+ "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+ "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+
+#define JCOPYRIGHT_SHORT \
+ "Copyright (C) 1991-2022 The libjpeg-turbo Project and many others"
diff --git a/media/libjpeg/moz.build b/media/libjpeg/moz.build
index 6519c30fbd..0da04dabc3 100644
--- a/media/libjpeg/moz.build
+++ b/media/libjpeg/moz.build
@@ -1,4 +1,5 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -22,6 +23,7 @@ SOURCES += [
'jdcolor.c',
'jddctmgr.c',
'jdhuff.c',
+ 'jdicc.c',
'jdinput.c',
'jdmainct.c',
'jdmarker.c',
@@ -54,6 +56,7 @@ SOURCES += [
'jccolor.c',
'jcdctmgr.c',
'jchuff.c',
+ 'jcicc.c',
'jcinit.c',
'jcmainct.c',
'jcmarker.c',
@@ -70,88 +73,188 @@ if CONFIG['LIBJPEG_TURBO_USE_YASM']:
if CONFIG['LIBJPEG_TURBO_ASFLAGS']:
if CONFIG['CPU_ARCH'] == 'arm':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch32',
+ ]
SOURCES += [
- 'simd/jsimd_arm.c',
- 'simd/jsimd_arm_neon.S',
+ 'simd/arm/aarch32/jchuff-neon.c',
+ 'simd/arm/aarch32/jsimd.c',
+ 'simd/arm/aarch32/jsimd_neon.S',
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdcolor-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jfdctint-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
]
elif CONFIG['CPU_ARCH'] == 'aarch64':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch64',
+ ]
+ SOURCES += [
+ 'simd/arm/aarch64/jsimd.c',
+ 'simd/arm/aarch64/jsimd_neon.S',
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jidctfst-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
+ ]
+ elif CONFIG['CPU_ARCH'] == 'mips32':
SOURCES += [
- 'simd/jsimd_arm64.c',
- 'simd/jsimd_arm64_neon.S',
+ 'simd/mips/jsimd.c',
+ 'simd/mips/jsimd_dspr2.S',
]
- elif CONFIG['CPU_ARCH'] == 'mips':
+ if CONFIG['CC_TYPE'] == 'clang':
+ SOURCES['simd/mips/jsimd_dspr2.S'].flags += [
+ '-fno-integrated-as',
+ ]
+ elif CONFIG['CPU_ARCH'] == 'mips64':
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/mips64']
SOURCES += [
- 'simd/jsimd_mips.c',
- 'simd/jsimd_mips_dspr2.S',
+ 'simd/mips64/jccolor-mmi.c',
+ 'simd/mips64/jcgray-mmi.c',
+ 'simd/mips64/jcsample-mmi.c',
+ 'simd/mips64/jdcolor-mmi.c',
+ 'simd/mips64/jdmerge-mmi.c',
+ 'simd/mips64/jdsample-mmi.c',
+ 'simd/mips64/jfdctfst-mmi.c',
+ 'simd/mips64/jfdctint-mmi.c',
+ 'simd/mips64/jidctfst-mmi.c',
+ 'simd/mips64/jidctint-mmi.c',
+ 'simd/mips64/jquanti-mmi.c',
+ 'simd/mips64/jsimd.c',
]
elif CONFIG['CPU_ARCH'] == 'x86_64':
SOURCES += [
- 'simd/jccolor-sse2-64.asm',
- 'simd/jcgray-sse2-64.asm',
- 'simd/jchuff-sse2-64.asm',
- 'simd/jcsample-sse2-64.asm',
- 'simd/jdcolor-sse2-64.asm',
- 'simd/jdmerge-sse2-64.asm',
- 'simd/jdsample-sse2-64.asm',
- 'simd/jfdctflt-sse-64.asm',
- 'simd/jfdctfst-sse2-64.asm',
- 'simd/jfdctint-sse2-64.asm',
- 'simd/jidctflt-sse2-64.asm',
- 'simd/jidctfst-sse2-64.asm',
- 'simd/jidctint-sse2-64.asm',
- 'simd/jidctred-sse2-64.asm',
- 'simd/jquantf-sse2-64.asm',
- 'simd/jquanti-sse2-64.asm',
- 'simd/jsimd_x86_64.c',
+ 'simd/x86_64/jccolor-avx2.asm',
+ 'simd/x86_64/jccolor-sse2.asm',
+ 'simd/x86_64/jcgray-avx2.asm',
+ 'simd/x86_64/jcgray-sse2.asm',
+ 'simd/x86_64/jchuff-sse2.asm',
+ 'simd/x86_64/jcphuff-sse2.asm',
+ 'simd/x86_64/jcsample-avx2.asm',
+ 'simd/x86_64/jcsample-sse2.asm',
+ 'simd/x86_64/jdcolor-avx2.asm',
+ 'simd/x86_64/jdcolor-sse2.asm',
+ 'simd/x86_64/jdmerge-avx2.asm',
+ 'simd/x86_64/jdmerge-sse2.asm',
+ 'simd/x86_64/jdsample-avx2.asm',
+ 'simd/x86_64/jdsample-sse2.asm',
+ 'simd/x86_64/jfdctflt-sse.asm',
+ 'simd/x86_64/jfdctfst-sse2.asm',
+ 'simd/x86_64/jfdctint-avx2.asm',
+ 'simd/x86_64/jfdctint-sse2.asm',
+ 'simd/x86_64/jidctflt-sse2.asm',
+ 'simd/x86_64/jidctfst-sse2.asm',
+ 'simd/x86_64/jidctint-avx2.asm',
+ 'simd/x86_64/jidctint-sse2.asm',
+ 'simd/x86_64/jidctred-sse2.asm',
+ 'simd/x86_64/jquantf-sse2.asm',
+ 'simd/x86_64/jquanti-avx2.asm',
+ 'simd/x86_64/jquanti-sse2.asm',
+ 'simd/x86_64/jsimd.c',
+ 'simd/x86_64/jsimdcpu.asm',
]
elif CONFIG['CPU_ARCH'] == 'x86':
SOURCES += [
- 'simd/jccolor-mmx.asm',
- 'simd/jccolor-sse2.asm',
- 'simd/jcgray-mmx.asm',
- 'simd/jcgray-sse2.asm',
- 'simd/jchuff-sse2.asm',
- 'simd/jcsample-mmx.asm',
- 'simd/jcsample-sse2.asm',
- 'simd/jdcolor-mmx.asm',
- 'simd/jdcolor-sse2.asm',
- 'simd/jdmerge-mmx.asm',
- 'simd/jdmerge-sse2.asm',
- 'simd/jdsample-mmx.asm',
- 'simd/jdsample-sse2.asm',
- 'simd/jfdctflt-3dn.asm',
- 'simd/jfdctflt-sse.asm',
- 'simd/jfdctfst-mmx.asm',
- 'simd/jfdctfst-sse2.asm',
- 'simd/jfdctint-mmx.asm',
- 'simd/jfdctint-sse2.asm',
- 'simd/jidctflt-3dn.asm',
- 'simd/jidctflt-sse.asm',
- 'simd/jidctflt-sse2.asm',
- 'simd/jidctfst-mmx.asm',
- 'simd/jidctfst-sse2.asm',
- 'simd/jidctint-mmx.asm',
- 'simd/jidctint-sse2.asm',
- 'simd/jidctred-mmx.asm',
- 'simd/jidctred-sse2.asm',
- 'simd/jquant-3dn.asm',
- 'simd/jquant-mmx.asm',
- 'simd/jquant-sse.asm',
- 'simd/jquantf-sse2.asm',
- 'simd/jquanti-sse2.asm',
- 'simd/jsimd_i386.c',
- 'simd/jsimdcpu.asm',
+ 'simd/i386/jccolor-avx2.asm',
+ 'simd/i386/jccolor-mmx.asm',
+ 'simd/i386/jccolor-sse2.asm',
+ 'simd/i386/jcgray-avx2.asm',
+ 'simd/i386/jcgray-mmx.asm',
+ 'simd/i386/jcgray-sse2.asm',
+ 'simd/i386/jchuff-sse2.asm',
+ 'simd/i386/jcphuff-sse2.asm',
+ 'simd/i386/jcsample-avx2.asm',
+ 'simd/i386/jcsample-mmx.asm',
+ 'simd/i386/jcsample-sse2.asm',
+ 'simd/i386/jdcolor-avx2.asm',
+ 'simd/i386/jdcolor-mmx.asm',
+ 'simd/i386/jdcolor-sse2.asm',
+ 'simd/i386/jdmerge-avx2.asm',
+ 'simd/i386/jdmerge-mmx.asm',
+ 'simd/i386/jdmerge-sse2.asm',
+ 'simd/i386/jdsample-avx2.asm',
+ 'simd/i386/jdsample-mmx.asm',
+ 'simd/i386/jdsample-sse2.asm',
+ 'simd/i386/jfdctflt-3dn.asm',
+ 'simd/i386/jfdctflt-sse.asm',
+ 'simd/i386/jfdctfst-mmx.asm',
+ 'simd/i386/jfdctfst-sse2.asm',
+ 'simd/i386/jfdctint-avx2.asm',
+ 'simd/i386/jfdctint-mmx.asm',
+ 'simd/i386/jfdctint-sse2.asm',
+ 'simd/i386/jidctflt-3dn.asm',
+ 'simd/i386/jidctflt-sse.asm',
+ 'simd/i386/jidctflt-sse2.asm',
+ 'simd/i386/jidctfst-mmx.asm',
+ 'simd/i386/jidctfst-sse2.asm',
+ 'simd/i386/jidctint-avx2.asm',
+ 'simd/i386/jidctint-mmx.asm',
+ 'simd/i386/jidctint-sse2.asm',
+ 'simd/i386/jidctred-mmx.asm',
+ 'simd/i386/jidctred-sse2.asm',
+ 'simd/i386/jquant-3dn.asm',
+ 'simd/i386/jquant-mmx.asm',
+ 'simd/i386/jquant-sse.asm',
+ 'simd/i386/jquantf-sse2.asm',
+ 'simd/i386/jquanti-avx2.asm',
+ 'simd/i386/jquanti-sse2.asm',
+ 'simd/i386/jsimd.c',
+ 'simd/i386/jsimdcpu.asm',
]
+elif CONFIG['CPU_ARCH'].startswith('ppc'):
+ # PowerPC has no assembly files, but still needs its own headers.
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/powerpc']
+
+    # For libjpeg's internal runtime detection to work, jsimd.c must NOT
+    # be compiled with -maltivec (otherwise AltiVec support is enabled
+    # statically at compile time), but everything else should be. If
+    # -maltivec was already specified in .mozconfig, this does no harm.
+ ppc_vmx_sources = [
+ 'simd/powerpc/jccolor-altivec.c',
+ 'simd/powerpc/jcgray-altivec.c',
+ 'simd/powerpc/jcsample-altivec.c',
+ 'simd/powerpc/jdcolor-altivec.c',
+ 'simd/powerpc/jdmerge-altivec.c',
+ 'simd/powerpc/jdsample-altivec.c',
+ 'simd/powerpc/jfdctfst-altivec.c',
+ 'simd/powerpc/jfdctint-altivec.c',
+ 'simd/powerpc/jidctfst-altivec.c',
+ 'simd/powerpc/jidctint-altivec.c',
+ 'simd/powerpc/jquanti-altivec.c',
+ ]
+ SOURCES += ppc_vmx_sources
+ SOURCES += [
+ 'simd/powerpc/jsimd.c',
+ ]
+ for srcfile in ppc_vmx_sources:
+ SOURCES[srcfile].flags += CONFIG['PPC_VMX_FLAGS']
else: # No SIMD support?
SOURCES += [
'jsimd_none.c',
]
ASFLAGS += CONFIG['LIBJPEG_TURBO_ASFLAGS']
-ASFLAGS += ['-I%s/media/libjpeg/simd/' % TOPSRCDIR]
-if CONFIG['GKMEDIAS_SHARED_LIBRARY']:
- NO_VISIBILITY_FLAGS = True
+# Make sure the x86 & x86-64 ASM files can see the necessary includes.
+if CONFIG['CPU_ARCH'] == 'x86':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR]
+ ASFLAGS += ['-I%s/media/libjpeg/simd/i386/' % TOPSRCDIR]
+if CONFIG['CPU_ARCH'] == 'x86_64':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR]
+ ASFLAGS += ['-I%s/media/libjpeg/simd/x86_64/' % TOPSRCDIR]
# We allow warnings for third-party code that can be updated from upstream.
ALLOW_COMPILER_WARNINGS = True
diff --git a/media/libjpeg/mozilla.diff b/media/libjpeg/mozilla.diff
index 24b235b401..bc1bcb3066 100644
--- a/media/libjpeg/mozilla.diff
+++ b/media/libjpeg/mozilla.diff
@@ -1,32 +1,7 @@
-diff --git jmemmgr.c jmemmgr.c
---- jmemmgr.c
-+++ jmemmgr.c
-@@ -28,16 +28,17 @@
- */
-
- #define JPEG_INTERNALS
- #define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */
- #include "jinclude.h"
- #include "jpeglib.h"
- #include "jmemsys.h" /* import the system-dependent declarations */
- #include <stdint.h>
-+#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */
-
- #ifndef NO_GETENV
- #ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
- extern char *getenv (const char *name);
- #endif
- #endif
-
-
diff --git jmorecfg.h jmorecfg.h
--- jmorecfg.h
+++ jmorecfg.h
-@@ -9,16 +9,17 @@
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains additional configuration options that customize the
+@@ -13,8 +13,9 @@
* JPEG software for special applications or support machine-dependent
* optimizations. Most users will not need to touch this file.
*/
@@ -35,29 +10,13 @@ diff --git jmorecfg.h jmorecfg.h
/*
* Maximum number of components (color channels) allowed in JPEG image.
- * To meet the letter of the JPEG spec, set this to 255. However, darn
- * few applications need more than 4 channels (maybe 5 for CMYK + alpha
- * mask). We recommend 10 as a reasonable compromise; use 4 if you are
- * really short on memory. (Each allowed component costs a hundred or so
- * bytes of storage, whether actually used in an image or not.)
-@@ -118,39 +119,25 @@ typedef char JOCTET;
- * They must be at least as wide as specified; but making them too big
- * won't cost a huge amount of memory, so we don't provide special
- * extraction code like we did for JSAMPLE. (In other words, these
- * typedefs live at a different point on the speed/space tradeoff curve.)
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+@@ -95,23 +96,17 @@ typedef unsigned char JOCTET;
*/
/* UINT8 must hold at least the values 0..255. */
--#ifdef HAVE_UNSIGNED_CHAR
-typedef unsigned char UINT8;
--#else /* not HAVE_UNSIGNED_CHAR */
--#ifdef __CHAR_UNSIGNED__
--typedef char UINT8;
--#else /* not __CHAR_UNSIGNED__ */
--typedef short UINT8;
--#endif /* __CHAR_UNSIGNED__ */
--#endif /* HAVE_UNSIGNED_CHAR */
+typedef uint8_t UINT8;
/* UINT16 must hold at least the values 0..65535. */
@@ -79,23 +38,15 @@ diff --git jmorecfg.h jmorecfg.h
/* INT32 must hold at least signed 32-bit values.
*
* NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were
- * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to
- * long. It also wasn't common (or at least as common) in 1994 for INT32 to be
- * defined by platform headers. Since then, however, INT32 is defined in
- * several other common places:
-@@ -167,25 +154,17 @@ typedef short INT16;
- * This is a recipe for conflict, since "long" and "int" aren't always
- * compatible types. Since the definition of INT32 has technically been part
- * of the libjpeg API for more than 20 years, we can't remove it, but we do not
- * use it internally any longer. We instead define a separate type (JLONG)
+@@ -136,17 +131,9 @@ typedef short INT16;
* for internal use, which ensures that internal behavior will always be the
* same regardless of any external headers that may be included.
*/
-#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
--#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
--#ifndef _BASETSD_H /* MinGW is slightly different */
--#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
+-#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
+-#ifndef _BASETSD_H /* MinGW is slightly different */
+-#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
-typedef long INT32;
-#endif
-#endif
@@ -106,7 +57,3 @@ diff --git jmorecfg.h jmorecfg.h
/* Datatype used for image dimensions. The JPEG standard only supports
* images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
* "unsigned int" is sufficient on all machines. However, if you need to
- * handle larger images and you don't mind deviating from the spec, you
- * can change this datatype. (Note that changing this datatype will
- * potentially require modifying the SIMD code. The x86-64 SIMD extensions,
- * in particular, assume a 32-bit JDIMENSION.)
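
The jmorecfg.h hunks above track upstream's move away from feature-test guard chains toward exact-width <stdint.h> types, while Mozilla's local patch drops the conflict-prone INT32 typedef guards entirely. A short sketch of the new typedef form (illustrative only, not a copy of the header):

#include <stdint.h>

/* Illustrative only: an exact-width type makes the old guard chains
 * (HAVE_UNSIGNED_CHAR, __CHAR_UNSIGNED__, XMD_H, _BASETSD_H_, QGLOBAL_H)
 * unnecessary, since the width no longer depends on the platform. */
typedef uint8_t UINT8;   /* must hold at least the values 0..255 */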
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first copied (via memcpy) to a
+       * temporary buffer large enough to accommodate the vector load.
+       */
+ if (cols_remaining < 8) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
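
The Neon kernel above is a vector form of the Q16 fixed-point math described in the file's header comment, with the tail of each row copied into tmp_buf so the 8-wide loads never overread. A minimal scalar sketch of the same per-pixel arithmetic (hypothetical helper, not library code; adding 32768 before truncation rounds Y, while the 32767 folded into scaled_128_5 rounds Cb and Cr, per the header comment):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one pixel of the Q16 conversion above, using the same
 * constants listed in the header comment (19595 = 0.299 * 2^16, etc.). */
static void rgb_to_ycc(uint8_t r, uint8_t g, uint8_t b,
                       uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  uint32_t y32  = 19595 * r + 38470 * g + 7471 * b + 32768;  /* +0.5: round */
  uint32_t cb32 = (128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b;
  uint32_t cr32 = (128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b;
  *y  = (uint8_t)(y32  >> 16);   /* descale: drop the 2^-16 scale factor */
  *cb = (uint8_t)(cb32 >> 16);
  *cr = (uint8_t)(cr32 >> 16);
}

int main(void)
{
  uint8_t y, cb, cr;
  rgb_to_ycc(255, 0, 0, &y, &cb, &cr);      /* pure red */
  printf("Y=%u Cb=%u Cr=%u\n", y, cb, cr);  /* 76, 85, 255 */
  return 0;
}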
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint8_t block_nbits[DCTSIZE2];
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load rows of coefficients from DCT block in zig-zag order. */
+
+ /* Compute DC coefficient difference value. (F.1.1.5.1) */
+ int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+ row0 = vld1q_lane_s16(block + 1, row0, 1);
+ row0 = vld1q_lane_s16(block + 8, row0, 2);
+ row0 = vld1q_lane_s16(block + 16, row0, 3);
+ row0 = vld1q_lane_s16(block + 9, row0, 4);
+ row0 = vld1q_lane_s16(block + 2, row0, 5);
+ row0 = vld1q_lane_s16(block + 3, row0, 6);
+ row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+ int16x8_t row1 = vld1q_dup_s16(block + 17);
+ row1 = vld1q_lane_s16(block + 24, row1, 1);
+ row1 = vld1q_lane_s16(block + 32, row1, 2);
+ row1 = vld1q_lane_s16(block + 25, row1, 3);
+ row1 = vld1q_lane_s16(block + 18, row1, 4);
+ row1 = vld1q_lane_s16(block + 11, row1, 5);
+ row1 = vld1q_lane_s16(block + 4, row1, 6);
+ row1 = vld1q_lane_s16(block + 5, row1, 7);
+
+ int16x8_t row2 = vld1q_dup_s16(block + 12);
+ row2 = vld1q_lane_s16(block + 19, row2, 1);
+ row2 = vld1q_lane_s16(block + 26, row2, 2);
+ row2 = vld1q_lane_s16(block + 33, row2, 3);
+ row2 = vld1q_lane_s16(block + 40, row2, 4);
+ row2 = vld1q_lane_s16(block + 48, row2, 5);
+ row2 = vld1q_lane_s16(block + 41, row2, 6);
+ row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+ int16x8_t row3 = vld1q_dup_s16(block + 27);
+ row3 = vld1q_lane_s16(block + 20, row3, 1);
+ row3 = vld1q_lane_s16(block + 13, row3, 2);
+ row3 = vld1q_lane_s16(block + 6, row3, 3);
+ row3 = vld1q_lane_s16(block + 7, row3, 4);
+ row3 = vld1q_lane_s16(block + 14, row3, 5);
+ row3 = vld1q_lane_s16(block + 21, row3, 6);
+ row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+ uint16x8_t row0_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+ vnegq_s16(row0_lz));
+ uint16x8_t row1_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+ vnegq_s16(row1_lz));
+ uint16x8_t row2_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+ vnegq_s16(row2_lz));
+ uint16x8_t row3_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+ vnegq_s16(row3_lz));
+
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+ /* Store diff values for rows 0, 1, 2, and 3. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+ /* Load last four rows of coefficients from DCT block in zig-zag order. */
+ int16x8_t row4 = vld1q_dup_s16(block + 35);
+ row4 = vld1q_lane_s16(block + 42, row4, 1);
+ row4 = vld1q_lane_s16(block + 49, row4, 2);
+ row4 = vld1q_lane_s16(block + 56, row4, 3);
+ row4 = vld1q_lane_s16(block + 57, row4, 4);
+ row4 = vld1q_lane_s16(block + 50, row4, 5);
+ row4 = vld1q_lane_s16(block + 43, row4, 6);
+ row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+ int16x8_t row5 = vld1q_dup_s16(block + 29);
+ row5 = vld1q_lane_s16(block + 22, row5, 1);
+ row5 = vld1q_lane_s16(block + 15, row5, 2);
+ row5 = vld1q_lane_s16(block + 23, row5, 3);
+ row5 = vld1q_lane_s16(block + 30, row5, 4);
+ row5 = vld1q_lane_s16(block + 37, row5, 5);
+ row5 = vld1q_lane_s16(block + 44, row5, 6);
+ row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+ int16x8_t row6 = vld1q_dup_s16(block + 58);
+ row6 = vld1q_lane_s16(block + 59, row6, 1);
+ row6 = vld1q_lane_s16(block + 52, row6, 2);
+ row6 = vld1q_lane_s16(block + 45, row6, 3);
+ row6 = vld1q_lane_s16(block + 38, row6, 4);
+ row6 = vld1q_lane_s16(block + 31, row6, 5);
+ row6 = vld1q_lane_s16(block + 39, row6, 6);
+ row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+ int16x8_t row7 = vld1q_dup_s16(block + 53);
+ row7 = vld1q_lane_s16(block + 60, row7, 1);
+ row7 = vld1q_lane_s16(block + 61, row7, 2);
+ row7 = vld1q_lane_s16(block + 54, row7, 3);
+ row7 = vld1q_lane_s16(block + 47, row7, 4);
+ row7 = vld1q_lane_s16(block + 55, row7, 5);
+ row7 = vld1q_lane_s16(block + 62, row7, 6);
+ row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+ uint16x8_t row4_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+ vnegq_s16(row4_lz));
+ uint16x8_t row5_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+ vnegq_s16(row5_lz));
+ uint16x8_t row6_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+ vnegq_s16(row6_lz));
+ uint16x8_t row7_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+ vnegq_s16(row7_lz));
+
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+ /* Store diff values for rows 4, 5, 6, and 7. */
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+ uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+ uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+ uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+ uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+ uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+ uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+ uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+ row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+ row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+ row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+ row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+ row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+ row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+ row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+ /* Move bitmap to 32-bit scalar registers. */
+ uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+ uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ unsigned int nbits = block_nbits[0];
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = block_diff[0];
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ while (bitmap_1_32 != 0) {
+ r = BUILTIN_CLZ(bitmap_1_32);
+ i += r;
+ bitmap_1_32 <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap_1_32 <<= 1;
+ }
+
+ r = 33 - i;
+ i = 33;
+
+ while (bitmap_33_63 != 0) {
+ unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+ r += leading_zeros;
+ i += leading_zeros;
+ bitmap_33_63 <<= leading_zeros;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ r = 0;
+ i++;
+ bitmap_33_63 <<= 1;
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
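
The bitmap construction above lets the encoder locate each nonzero AC coefficient with one count-leading-zeros step instead of testing coefficients one at a time. A minimal standalone sketch of that run-skipping loop (hypothetical walk_ac(), using GCC/Clang's __builtin_clz in place of the BUILTIN_CLZ macro and a 32-bit map covering positions 1..32):

#include <stdint.h>
#include <stdio.h>

/* Sketch, outside the library: a set bit marks a nonzero AC coefficient;
 * counting leading zeros yields the run length of zeros in one step. */
static void walk_ac(uint32_t bitmap /* bit 31 = coefficient 1 */)
{
  unsigned int i = 1;                    /* first AC coefficient */
  while (bitmap != 0) {
    unsigned int run = (unsigned int)__builtin_clz(bitmap);
    i += run;                            /* skip the zero run */
    bitmap <<= run;                      /* nonzero bit now at position 31 */
    printf("coefficient %u follows a run of %u zeros\n", i, run);
    i++;
    bitmap <<= 1;                        /* consume the nonzero bit */
  }
}

int main(void)
{
  walk_ac(0x90400000u);                  /* nonzero at positions 1, 4, 10 */
  return 0;
}

In the real encoder each located coefficient then feeds the run length and bit count into the Huffman symbol RS = (r << 4) + nbits, as the F.1.2.2.1 comments above note.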
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..e3adf23d50
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,980 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "Features", 8) != 0)
+ return 0;
+ buffer += 8;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "neon"))
+ simd_support |= JSIMD_NEON;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ARM_NEON__)
+ simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* Even if the -mcpu/-mfpu options passed to gcc do not enable Neon
+   * globally, we may still be able to use it, so perform runtime detection
+   * by parsing /proc/cpuinfo on Linux/Android. */
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start, Sl,
+ Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
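+
+/* The FIX_* values above are the ISLOW IDCT cosine multipliers in 13-bit
+ * fixed point, i.e. FIX(x) = round(x * 2^13). A minimal sketch of how such
+ * a table is derived (illustration only; this FIX() helper is hypothetical
+ * and not part of the upstream sources):
+ *
+ * #define CONST_BITS 13
+ * #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))
+ *
+ * FIX(0.298631336) == 2446 and FIX(1.175875602) == 9633, matching the
+ * constants above. The MINUS/PLUS combinations pre-fold constant pairs so
+ * that shared products can be computed with one multiply-accumulate.
+ */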
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+}
+
+#define XFIX_0_899976223 d0[0]
+#define XFIX_0_541196100 d0[1]
+#define XFIX_2_562915447 d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
+#define XFIX_1_175875602 d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* d0[0] */
+ .short FIX_0_541196100 /* d0[1] */
+ .short FIX_2_562915447 /* d0[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
+ .short FIX_1_175875602 /* d1[3] */
+ /* reloadable constants */
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ ROW0L .req d16
+ ROW0R .req d17
+ ROW1L .req d18
+ ROW1R .req d19
+ ROW2L .req d20
+ ROW2R .req d21
+ ROW3L .req d22
+ ROW3R .req d23
+ ROW4L .req d24
+ ROW4R .req d25
+ ROW5L .req d26
+ ROW5R .req d27
+ ROW6L .req d28
+ ROW6R .req d29
+ ROW7L .req d30
+ ROW7R .req d31
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_islow_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
+ add ip, ip, #16
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d15} /* save Neon registers */
+ /* 1-D IDCT, pass 1, left 4x8 half */
+ vadd.s16 d4, ROW7L, ROW3L
+ vadd.s16 d5, ROW5L, ROW1L
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d5, XFIX_1_175875602
+ vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ push {r4, r5}
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ orr r0, r0, r4
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ orr r0, r0, r4
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ orr r0, r0, r4
+ vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
+ vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
+ vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
+ vadd.s32 q2, q5, q6
+ orrs r0, r0, r5
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ orr r0, r4, r5
+ vsub.s32 q3, q1, q4
+ pop {r4, r5}
+ vrshrn.s32 ROW7L, q2, #11
+ vrshrn.s32 ROW3L, q5, #11
+ vrshrn.s32 ROW0L, q6, #11
+ vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Branch to special handling for the sparse
+ right 4x8 half */
+
+ /* 1-D IDCT, pass 1, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vadd.s16 d10, ROW7R, ROW3R
+ vadd.s16 d8, ROW5R, ROW1R
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d8, XFIX_1_175875602
+ vtrn.16 ROW2L, ROW3L
+ vmull.s16 q7, d10, XFIX_1_175875602
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
+ vtrn.16 ROW0L, ROW1L
+ vsubl.s16 q3, ROW0R, ROW4R
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vtrn.16 ROW4L, ROW5L
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vtrn.32 ROW1L, ROW3L
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vtrn.32 ROW4L, ROW6L
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vtrn.32 ROW0L, ROW2L
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1R, q1, #11
+ vtrn.32 ROW5L, ROW7L
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vrshrn.s32 ROW6R, q1, #11
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0R, ROW4R
+ vrshrn.s32 ROW2R, q1, #11
+ vrshrn.s32 ROW5R, q3, #11
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vrshrn.s32 ROW7R, q2, #11
+ vrshrn.s32 ROW3R, q5, #11
+ vrshrn.s32 ROW0R, q6, #11
+ vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
+ vmov q4, q6
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+
+2: /* Descale to 8-bit and range limit */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
+ vpop {d8 - d15} /* restore Neon registers */
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
+ pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+ b 2b /* Go to epilogue */
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+ .unreq ROW0L
+ .unreq ROW0R
+ .unreq ROW1L
+ .unreq ROW1R
+ .unreq ROW2L
+ .unreq ROW2R
+ .unreq ROW3L
+ .unreq ROW3R
+ .unreq ROW4L
+ .unreq ROW4R
+ .unreq ROW5L
+ .unreq ROW5R
+ .unreq ROW6L
+ .unreq ROW6R
+ .unreq ROW7L
+ .unreq ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not-so-accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c.
+ *
+ * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
+ * the Arm Neon case some extra additions are required because the VQDMULH
+ * instruction can't handle constants larger than 1. So expressions like
+ * "x * 1.082392200" have to be converted to "x * 0.082392200 + x", which
+ * introduces an extra addition. Overall, there are 6 extra additions per
+ * 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
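+
+/* A worked instance of the decomposition described above (illustration
+ * only): VQDMULH.S16 computes (a * b * 2) >> 16, i.e. it multiplies by a
+ * Q15 fraction, so a constant >= 1 is split into an integer part handled
+ * with plain additions plus a Q15 remainder. With the table below:
+ *
+ * x * 1.082392200 -> x + VQDMULH(x, 2688) (2688 / 32768 ~= 0.0820)
+ * x * 2.613125930 -> 2 * x + VQDMULH(x, 20096) (20096 / 32768 ~= 0.6133)
+ */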
+
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_ifast_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0}, [ip, :64] /* load constants */
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d13} /* save Neon registers */
+ /* 1-D IDCT, pass 1 */
+ vsub.s16 q2, q10, q14
+ vadd.s16 q14, q10, q14
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vadd.s16 q10, q10, q2
+ /* Transpose */
+ vtrn.16 q8, q9
+ vsub.s16 q11, q12, q1
+ vtrn.16 q14, q15
+ vadd.s16 q12, q12, q1
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q8, q10
+ vtrn.32 q13, q15
+ vswp d28, d21
+ vswp d26, d19
+ /* 1-D IDCT, pass 2 */
+ vsub.s16 q2, q10, q14
+ vswp d30, d23
+ vadd.s16 q14, q10, q14
+ vswp d24, d17
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vpop {d8 - d13} /* restore Neon registers */
+ vadd.s16 q10, q10, q2
+ vsub.s16 q11, q12, q1
+ vadd.s16 q12, q12, q1
+ /* Descale to 8-bit and range limit */
+ vmov.u8 q0, #0x80
+ vqshrn.s16 d16, q8, #5
+ vqshrn.s16 d17, q9, #5
+ vqshrn.s16 d18, q10, #5
+ vqshrn.s16 d19, q11, #5
+ vqshrn.s16 d20, q12, #5
+ vqshrn.s16 d21, q13, #5
+ vqshrn.s16 d22, q14, #5
+ vqshrn.s16 d23, q15, #5
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vadd.u8 q10, q10, q0
+ vadd.u8 q11, q11, q0
+ /* Transpose the final 8-bit samples */
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vst1.8 {d20}, [TMP1]
+ vtrn.8 d22, d23
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+ .if \size == 8
+ vst1.8 {d20}, [Y]!
+ vst1.8 {d21}, [U]!
+ vst1.8 {d22}, [V]!
+ .elseif \size == 4
+ vst1.8 {d20[0]}, [Y]!
+ vst1.8 {d20[1]}, [Y]!
+ vst1.8 {d20[2]}, [Y]!
+ vst1.8 {d20[3]}, [Y]!
+ vst1.8 {d21[0]}, [U]!
+ vst1.8 {d21[1]}, [U]!
+ vst1.8 {d21[2]}, [U]!
+ vst1.8 {d21[3]}, [U]!
+ vst1.8 {d22[0]}, [V]!
+ vst1.8 {d22[1]}, [V]!
+ vst1.8 {d22[2]}, [V]!
+ vst1.8 {d22[3]}, [V]!
+ .elseif \size == 2
+ vst1.8 {d20[4]}, [Y]!
+ vst1.8 {d20[5]}, [Y]!
+ vst1.8 {d21[4]}, [U]!
+ vst1.8 {d21[5]}, [U]!
+ vst1.8 {d22[4]}, [V]!
+ vst1.8 {d22[5]}, [V]!
+ .elseif \size == 1
+ vst1.8 {d20[6]}, [Y]!
+ vst1.8 {d21[6]}, [U]!
+ vst1.8 {d22[6]}, [V]!
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_load bpp, size
+ .if \bpp == 24
+ .if \size == 8
+ vld3.8 {d10, d11, d12}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
+ .elseif \size == 2
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
+ .elseif \size == 1
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ vld4.8 {d10, d11, d12, d13}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+ .elseif \size == 2
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+ .elseif \size == 1
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
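+
+/* Note how the tail of the pixel loop composes these macros: a remainder of
+ * N = width % 8 pixels is processed as its binary decomposition, e.g. N = 7
+ * loads 4, then 2, then 1 pixels (into successive lanes), converts once,
+ * then stores 4, 2 and 1 samples per plane.
+ */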
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
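+
+/* Concretely, the steady-state inner loop below runs
+ * do_rgb_to_yuv_stage2_store_load_stage1, which interleaves stage 2
+ * (narrow + store) of iteration N with the load and stage 1 (widen +
+ * multiply-accumulate) of iteration N+1, hiding memory latency behind
+ * arithmetic.
+ */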
+
+.macro do_rgb_to_yuv_stage1
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vrev64.32 q9, q1
+ vrev64.32 q13, q1
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vshrn.u32 d23, q13, #16
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovn.u16 d22, q12 /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+ do_rgb_to_yuv_stage1
+ do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vrev64.32 q9, q1
+ vshrn.u32 d23, q13, #16
+ vrev64.32 q13, q1
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ do_load \bpp, 8
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovn.u16 d22, q12 /* d22 = v */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vst1.8 {d20}, [Y]!
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vst1.8 {d21}, [U]!
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vst1.8 {d22}, [V]!
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
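+
+/* Illustration (not part of the upstream sources): the first two rows above
+ * are round(c * 2^16) for the RGB->YCbCr coefficients, e.g.
+ * round(0.29900 * 65536) = 19595. The last two rows, read as 32-bit lanes,
+ * equal (128 << 16) + 32767: the +128 chroma offset plus a just-under-0.5
+ * bias, so the truncating VSHRN used for Cb and Cr rounds correctly.
+ */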
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+ OUTPUT_WIDTH .req r0
+ INPUT_BUF .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_ROW .req r3
+ NUM_ROWS .req r4
+
+ OUTPUT_BUF0 .req r5
+ OUTPUT_BUF1 .req r6
+ OUTPUT_BUF2 .req OUTPUT_BUF
+
+ RGB .req r7
+ Y .req r8
+ U .req r9
+ V .req r10
+ N .req ip
+
+ /* Load constants to d0, d1, d2, d3 */
+ adr ip, jsimd_\colorid\()_ycc_neon_consts
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
+
+ /* Save Arm registers and handle input arguments */
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ ldr NUM_ROWS, [sp, #(4 * 8)]
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
+ .unreq OUTPUT_BUF
+
+ /* Save Neon registers */
+ vpush {d8 - d15}
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ blt 9f
+0:
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+ mov N, OUTPUT_WIDTH
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+ add OUTPUT_ROW, OUTPUT_ROW, #1
+ ldr RGB, [INPUT_BUF], #4
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ blt 3f
+ do_load \bpp, 8
+ do_rgb_to_yuv_stage1
+ subs N, N, #8
+ blt 2f
+1:
+ do_rgb_to_yuv_stage2_store_load_stage1
+ subs N, N, #8
+ bge 1b
+2:
+ do_rgb_to_yuv_stage2
+ do_store 8
+ tst N, #7
+ beq 8f
+3:
+ tst N, #4
+ beq 3f
+ do_load \bpp, 4
+3:
+ tst N, #2
+ beq 4f
+ do_load \bpp, 2
+4:
+ tst N, #1
+ beq 5f
+ do_load \bpp, 1
+5:
+ do_rgb_to_yuv
+ tst N, #4
+ beq 6f
+ do_store 4
+6:
+ tst N, #2
+ beq 7f
+ do_store 2
+7:
+ tst N, #1
+ beq 8f
+ do_store 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 0b
+9:
+ /* Restore all registers and return */
+ vpop {d8 - d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+
+ .unreq OUTPUT_WIDTH
+ .unreq OUTPUT_ROW
+ .unreq INPUT_BUF
+ .unreq NUM_ROWS
+ .unreq OUTPUT_BUF0
+ .unreq OUTPUT_BUF1
+ .unreq OUTPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R G B */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
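+
+/* How the offsets map: do_load deinterleaves the pixel stream into
+ * d10/d11/d12(/d13), so \r_offs, \g_offs and \b_offs name which
+ * deinterleaved register holds each channel. For extbgr (B, G, R in
+ * memory), vld3.8 puts B in d10, G in d11 and R in d12, hence
+ * r_offs = 2, g_offs = 1, b_offs = 0.
+ */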
+
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds the result up or down via integer truncation.
+ */
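+
+/* A scalar reference for one pixel under the same fixed-point scheme (a
+ * minimal sketch for illustration; the helper below is hypothetical and not
+ * part of the upstream sources):
+ *
+ * static inline void rgb_to_ycc_scalar(uint8_t r, uint8_t g, uint8_t b,
+ * uint8_t *y, uint8_t *cb, uint8_t *cr)
+ * {
+ * *y = (uint8_t)((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
+ * *cb = (uint8_t)((((128 << 16) + 32767) - 11059 * r - 21709 * g +
+ * 32768 * b) >> 16);
+ * *cr = (uint8_t)((((128 << 16) + 32767) + 32768 * r - 27439 * g -
+ * 5329 * b) >> 16);
+ * }
+ */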
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+ cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+ 0, 1, 2, 3, 16, 17, 32, 33,
+ 18, 19, 4, 5, 6, 7, 20, 21,
+ 34, 35, 48, 49, 255, 255, 50, 51,
+ 36, 37, 22, 23, 8, 9, 10, 11,
+ 255, 255, 6, 7, 20, 21, 34, 35,
+ 48, 49, 255, 255, 50, 51, 36, 37,
+ 54, 55, 40, 41, 26, 27, 12, 13,
+ 14, 15, 28, 29, 42, 43, 56, 57,
+ 6, 7, 20, 21, 34, 35, 48, 49,
+ 50, 51, 36, 37, 22, 23, 8, 9,
+ 26, 27, 12, 13, 255, 255, 14, 15,
+ 28, 29, 42, 43, 56, 57, 255, 255,
+ 52, 53, 54, 55, 40, 41, 26, 27,
+ 12, 13, 255, 255, 14, 15, 28, 29,
+ 26, 27, 40, 41, 42, 43, 28, 29,
+ 14, 15, 30, 31, 44, 45, 46, 47
+};
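+
+/* Illustration (not part of the upstream sources): each byte pair above is
+ * a TBL index into the 16-bit coefficient rows, selecting coefficients in
+ * zig-zag order. The first row, {0,1, 2,3, 16,17, 32,33, ...}, picks the
+ * bytes of block[0], block[1], block[8], block[16], ... -- zig-zag entries
+ * 0-7. Index 255 falls outside the table and yields 0; those lanes are
+ * patched afterwards with vsetq_lane_s16().
+ */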
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address. That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const uint8x16x4_t idx_rows_0123 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+ const uint8x16x4_t idx_rows_4567 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+ /* GCC does not currently support the vld1q_<type>_x4() intrinsics. */
+ const uint8x16x4_t idx_rows_0123 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+ } };
+ const uint8x16x4_t idx_rows_4567 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+ } };
+#endif
+
+ /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const int8x16x4_t tbl_rows_0123 =
+ vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+ const int8x16x4_t tbl_rows_4567 =
+ vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+ const int8x16x4_t tbl_rows_0123 = { {
+ vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+ } };
+ const int8x16x4_t tbl_rows_4567 = { {
+ vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+ } };
+#endif
+
+ /* Initialise extra lookup tables. */
+ const int8x16x4_t tbl_rows_2345 = { {
+ tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+ tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+ } };
+ const int8x16x3_t tbl_rows_567 =
+ { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+ /* Shuffle coefficients into zig-zag order. */
+ int16x8_t row0 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+ int16x8_t row1 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+ int16x8_t row2 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+ int16x8_t row3 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+ int16x8_t row4 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+ int16x8_t row5 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+ int16x8_t row6 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+ int16x8_t row7 =
+ vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+ /* Compute DC coefficient difference value (F.1.1.5.1). */
+ row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+ /* Initialize AC coefficient lanes not reachable by lookup tables. */
+ row1 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+ 0), row1, 2);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 4), row2, 0);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 0), row2, 5);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 7), row5, 2);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 3), row5, 7);
+ row6 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+ 7), row6, 5);
+
+ /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+ uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+ uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+ uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+ uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+ uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+ uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+ uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+ uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+ vreinterpretq_u8_u16(row0_ne_0));
+ uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+ vreinterpretq_u8_u16(row2_ne_0));
+ uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+ vreinterpretq_u8_u16(row4_ne_0));
+ uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+ vreinterpretq_u8_u16(row6_ne_0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x16_t bitmap_mask =
+ vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+ uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+ uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+ bitmap_rows_3210);
+ uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+ vget_high_u8(bitmap_rows_76543210));
+
+ /* Shift left to remove DC bit. */
+ bitmap_all =
+ vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+ /* Count bits set (number of non-zero coefficients) in bitmap. */
+ unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+ /* Move bitmap to 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
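+  /* With the DC bit shifted out above, bit (63 - n) of the scalar bitmap is
+   * set iff AC coefficient n + 1 (in zig-zag order) is non-zero, so
+   * BUILTIN_CLZLL() below directly yields the run length of zero-valued
+   * coefficients preceding the next non-zero one.
+   */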
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = coeff - 1 = ~abs(coeff) (two's complement) */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ /* Find nbits required to specify sign and amplitude of coefficient. */
+ unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+ unsigned int nbits = 16 - lz;
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
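+  /* For example, a DC diff of -3 gives abs_row0[0] = 3 and lz = 14, hence
+   * nbits = 2, row0_mask[0] = 0xFFFF >> 14 = 0x0003, and diff = 3 ^ 3 = 0;
+   * thus the two appended bits are 00, as specified for -3 (F.1.2.1).
+   */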
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* The most efficient method of computing nbits and diff depends on the
+ * number of non-zero coefficients. If the bitmap is not too sparse (> 8
+ * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; otherwise, we do some of the work using Neon and compute the rest
+   * on demand using scalar code.
+ */
+ if (non_zero_coefficients > 8) {
+ uint8_t block_nbits[DCTSIZE2];
+
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Narrow leading zero count to 8 bits. */
+ uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+ vreinterpretq_u8_s16(row1_lz));
+ uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+ vreinterpretq_u8_s16(row3_lz));
+ uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+ vreinterpretq_u8_s16(row5_lz));
+ uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+ vreinterpretq_u8_s16(row7_lz));
+ /* Compute nbits needed to specify magnitude of each coefficient. */
+ uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+ uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+ uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+ uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+ /* Store nbits. */
+ vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+ vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+ vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+ vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+ /* Mask bits not required to specify sign and amplitude of diff. */
+ uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+ uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+ uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+ uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+ uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+ uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+ uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+ /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ row3_mask);
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ row7_mask);
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ } else if (bitmap != 0) {
+ uint16_t block_abs[DCTSIZE2];
+ /* Compute and store absolute value of coefficients. */
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+ vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+ vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+ vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+ vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+ vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+ vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+ vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+ /* Compute diff bits (without nbits mask) and store. */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ vcltzq_s16(row1));
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ vcltzq_s16(row2));
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ vcltzq_s16(row3));
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ vcltzq_s16(row4));
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ vcltzq_s16(row5));
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ vcltzq_s16(row6));
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ vcltzq_s16(row7));
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Same as above but must mask diff bits and compute nbits on demand. */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ lz = BUILTIN_CLZ(block_abs[i]);
+ nbits = 32 - lz;
+ diff = ((unsigned int)block_diff[i] << lz) >> lz;
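+      /* The left shift followed by a logical right shift by lz clears the
+       * high-order bits, keeping only the nbits significant bits of diff. */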
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..604d5472f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1058 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "jconfigint.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+ char *p;
+
+ if (*value == 0)
+ return 0;
+ if (strncmp(buffer, field, strlen(field)) != 0)
+ return 0;
+ buffer += strlen(field);
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'value' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, value))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(value);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
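+
+/* For example, given the /proc/cpuinfo line "CPU part : 0xd03",
+ * check_cpuinfo(buffer, "CPU part", "0xd03") returns 1, whereas a value that
+ * occurs only as a substring of a longer token does not match.
+ */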
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+      /* "impossible" happened - the line did not fit in the buffer, so give
+         up and let the caller retry with a bigger one */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+ check_cpuinfo(buffer, "CPU part", "0xd07"))
+ /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+ percent speedup by disabling the use of that instruction. The
+ speedup on Cortex-A57 is more subtle but still measurable. */
+ simd_features &= ~JSIMD_FASTTBL;
+ else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+ /* The SIMD version of Huffman encoding is slower than the C version on
+ Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that
+           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
+ simd_huffman = simd_features = 0;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multithreaded environment.
+ */
+
+/*
+ * Armv8 architectures support the Neon extension by default;
+ * unlike on Armv7, it is not optional.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+ simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTST3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTST3;
+#endif
+}
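+
+/* A usage note: setting JSIMD_FORCENONE=1 in the environment disables all
+ * SIMD acceleration (handy when bisecting suspected SIMD bugs), while
+ * JSIMD_NOHUFFENC=1 disables only the SIMD Huffman encoder.
+ */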
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTTBL)
+#endif
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+#ifndef NEON_INTRINSICS
+ else
+ return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+ last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/jsimd_arm64_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
index 3309858241..738a4f0658 100644
--- a/media/libjpeg/simd/jsimd_arm64_neon.S
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -1,13 +1,13 @@
/*
- * ARMv8 NEON optimizations for libjpeg-turbo
+ * Armv8 Neon optimizations for libjpeg-turbo
*
* Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
* Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
* Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
@@ -31,10 +31,158 @@
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
#endif
-.text
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
-#define RESPECT_STRICT_ALIGNMENT 1
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+ .short 0, 0, 0, 0
+ .short 22971, -11277, -23401, 29033
+ .short -128, -128, -128, -128
+ .short -128, -128, -128, -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
+        255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
+
+.text
/*****************************************************************************/
@@ -42,6 +190,7 @@
/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
+ .private_extern _\fname
.globl _\fname
_\fname:
#else
@@ -54,43 +203,15 @@ _\fname:
#endif
.endm
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
- ins \xi\xilen[0], \x0\xilen[0]
- ins \x1\xilen[0], \x0\xilen[1]
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 differnet registers */
-.macro transpose x0, x1, xi, xilen, literal
- mov \xi\xilen, \x0\xilen
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x2\x2len
- trn2 \x2\x2len, \xi\x0len, \x2\x2len
- mov \xi\xilen, \x1\xilen
- trn1 \x1\x1len, \x1\x1len, \x3\x3len
- trn2 \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x1\x1len
- trn2 \x1\x2len, \xi\x0len, \x1\x2len
- mov \xi\xilen, \x2\xilen
- trn1 \x2\x2len, \x2\x2len, \x3\x3len
- trn2 \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
- transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
- transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+ adrp \reg, \symbol@PAGE
+ add \reg, \reg, \symbol@PAGEOFF
+#else
+ adrp \reg, \symbol
+ add \reg, \reg, :lo12:\symbol
+#endif
.endm
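+
+/* adrp/add (@PAGE/@PAGEOFF on Apple platforms) can materialize the address
+ * of a symbol in any section within a +/-4 GiB range, whereas the plain adr
+ * used previously reaches only +/-1 MiB and therefore required the constants
+ * to stay in .text next to the code.
+ */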
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
@@ -123,7 +244,7 @@ _\fname:
.endm
-#define CENTERJSAMPLE 128
+#define CENTERJSAMPLE 128
/*****************************************************************************/
@@ -131,70 +252,25 @@ _\fname:
* Perform dequantization and inverse DCT on one block of coefficients.
*
* GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
*/
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short - F_0_899
- .short F_1_175
- .short F_1_501
- .short - F_1_847
- .short - F_1_961
- .short F_2_053
- .short - F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
asm_function jsimd_idct_islow_neon
DCT_TABLE .req x0
@@ -216,7 +292,7 @@ asm_function jsimd_idct_islow_neon
uxtw x3, w3
sub sp, sp, #64
- adr x15, Ljsimd_idct_islow_neon_consts
+ get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
mov x10, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
@@ -292,8 +368,8 @@ asm_function jsimd_idct_islow_neon
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -323,20 +399,20 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -380,40 +456,40 @@ asm_function jsimd_idct_islow_neon
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
- shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
movi v0.16b, #(CENTERJSAMPLE)
- /* Prepare pointers (dual-issue with NEON instructions) */
+ /* Prepare pointers (dual-issue with Neon instructions) */
ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP1, TMP1, OUTPUT_COL
- sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP2, TMP2, OUTPUT_COL
- sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP3, TMP3, OUTPUT_COL
- sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP4, TMP4, OUTPUT_COL
- sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP5, TMP5, OUTPUT_COL
add v16.16b, v28.16b, v0.16b
add TMP6, TMP6, OUTPUT_COL
@@ -474,7 +550,7 @@ asm_function jsimd_idct_islow_neon
sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v20.16b, v18.16b /* tmp3 = z1 */
sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
@@ -496,10 +572,10 @@ asm_function jsimd_idct_islow_neon
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
add v24.4s, v24.4s, v26.4s /* z4 += z5 */
@@ -525,14 +601,14 @@ asm_function jsimd_idct_islow_neon
add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
mov v6.16b, v15.16b
mov v7.16b, v15.16b
mov v8.16b, v15.16b
@@ -551,7 +627,7 @@ asm_function jsimd_idct_islow_neon
sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
@@ -574,10 +650,10 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -609,14 +685,14 @@ asm_function jsimd_idct_islow_neon
mov v3.16b, v14.16b
mov v4.16b, v14.16b
mov v5.16b, v14.16b
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.balign 16
@@ -631,8 +707,8 @@ asm_function jsimd_idct_islow_neon
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -662,20 +738,20 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -719,22 +795,22 @@ asm_function jsimd_idct_islow_neon
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.unreq DCT_TABLE
@@ -770,665 +846,6 @@ asm_function jsimd_idct_islow_neon
/*****************************************************************************/
/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in the ARM NEON case, some extra additions are required because the
- * VQDMULH instruction can't handle constants larger than 1. So expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totaling 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
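To make the constant-splitting trick concrete, here is a rough scalar sketch in C. The vqdmulh model below ignores saturation, and the splitting matches the constant table that follows, where 1.082392200 is encoded as (277 * 128 - 256 * 128) = 2688, i.e. only the fractional part:

    #include <stdint.h>

    /* Scalar model of VQDMULH on 16-bit lanes: (2 * a * b) >> 16,
     * i.e. a Q15 x Q15 -> Q15 multiply (saturation omitted). */
    static int16_t vqdmulh_s16(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * (int32_t)b * 2) >> 16);
    }

    /* "x * 1.082392200" rewritten as "x * 0.082392200 + x"; 2688 is
     * the table's encoding (277 * 128 - 256 * 128) of the fractional
     * part, and the extra addition restores the integer part. */
    static int16_t mul_1_082392200(int16_t x)
    {
      return (int16_t)(vqdmulh_s16(x, 2688) + x);
    }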
-#define XFIX_1_082392200 v0.h[0]
-#define XFIX_1_414213562 v0.h[1]
-#define XFIX_1_847759065 v0.h[2]
-#define XFIX_2_613125930 v0.h[3]
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( v16.8h )
- * 1 | d18 | d19 ( v17.8h )
- * 2 | d20 | d21 ( v18.8h )
- * 3 | d22 | d23 ( v19.8h )
- * 4 | d24 | d25 ( v20.8h )
- * 5 | d26 | d27 ( v21.8h )
- * 6 | d28 | d29 ( v22.8h )
- * 7 | d30 | d31 ( v23.8h )
- */
- /* Load the IDCT constants */
- adr TMP5, Ljsimd_idct_ifast_neon_consts
- ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
- mul v16.8h, v16.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v17.8h, v17.8h, v1.8h
- ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
- mul v18.8h, v18.8h, v2.8h
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- mul v19.8h, v19.8h, v3.8h
- ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
- mul v20.8h, v20.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v22.8h, v22.8h, v2.8h
- mul v21.8h, v21.8h, v1.8h
- ld1 {v0.4h}, [TMP5] /* load constants */
- mul v23.8h, v23.8h, v3.8h
-
- /* 1-D IDCT, pass 1 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
- /* 1-D IDCT, pass 2 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- /* Descale to 8-bit and range limit */
- movi v0.16b, #0x80
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqshrn v28.8b, v16.8h, #5
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqshrn v29.8b, v17.8h, #5
- add TMP1, TMP1, OUTPUT_COL
- sqshrn v30.8b, v18.8h, #5
- add TMP2, TMP2, OUTPUT_COL
- sqshrn v31.8b, v19.8h, #5
- add TMP3, TMP3, OUTPUT_COL
- sqshrn2 v28.16b, v20.8h, #5
- add TMP4, TMP4, OUTPUT_COL
- sqshrn2 v29.16b, v21.8h, #5
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqshrn2 v30.16b, v22.8h, #5
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqshrn2 v31.16b, v23.8h, #5
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting a reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON-optimized function is
- * bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling can be achieved by expanding
- * the idct_helper/transpose_4x4 macros and reordering instructions,
- * but readability would suffer somewhat.
- */
-
-#define CONST_BITS 13
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
-
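These constants follow IJG's usual fixed-point convention, FIX(x) = round(x * 2^CONST_BITS); a hypothetical generator for the table, assuming CONST_BITS == 13 as defined above:

    /* Hypothetical regeneration of the table above under the IJG
     * convention; e.g. FIX(0.211164243) == 1730 and
     * FIX(1.847759065) == 15137, matching the listed values. */
    #define CONST_BITS 13
    #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))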
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* v0.h[0] */
- .short -FIX_0_765366865 /* v0.h[1] */
- .short -FIX_0_211164243 /* v0.h[2] */
- .short FIX_1_451774981 /* v0.h[3] */
- .short -FIX_2_172734803 /* d1[0] */
- .short FIX_1_061594337 /* d1[1] */
- .short -FIX_0_509795579 /* d1[2] */
- .short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* v2.h[0] */
- .short FIX_2_562915447 /* v2.h[1] */
- .short 1 << (CONST_BITS+1) /* v2.h[2] */
- .short 0 /* v2.h[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- smull v28.4s, \x4, v2.h[2]
- smlal v28.4s, \x8, v0.h[0]
- smlal v28.4s, \x14, v0.h[1]
-
- smull v26.4s, \x16, v1.h[2]
- smlal v26.4s, \x12, v1.h[3]
- smlal v26.4s, \x10, v2.h[0]
- smlal v26.4s, \x6, v2.h[1]
-
- smull v30.4s, \x4, v2.h[2]
- smlsl v30.4s, \x8, v0.h[0]
- smlsl v30.4s, \x14, v0.h[1]
-
- smull v24.4s, \x16, v0.h[2]
- smlal v24.4s, \x12, v0.h[3]
- smlal v24.4s, \x10, v1.h[0]
- smlal v24.4s, \x6, v1.h[1]
-
- add v20.4s, v28.4s, v26.4s
- sub v28.4s, v28.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v28.4s, v28.4s, #\shift
- xtn \y26, v20.4s
- xtn \y29, v28.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y29, v28.4s, #\shift
- .endif
-
- add v20.4s, v30.4s, v24.4s
- sub v30.4s, v30.4s, v24.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v30.4s, v30.4s, #\shift
- xtn \y27, v20.4s
- xtn \y28, v30.4s
- .else
- rshrn \y27, v20.4s, #\shift
- rshrn \y28, v30.4s, #\shift
- .endif
-.endm
-
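The \shift > 16 branch in idct_helper exists because the rshrn immediate can be at most 16 when narrowing to 16-bit lanes, so larger descale shifts take the srshr-then-xtn path; both paths apply the same round-to-nearest shift, IJG's DESCALE. A minimal C sketch of that rounding:

    #include <stdint.h>

    /* DESCALE(x, n): divide by 2^n, rounding to nearest.  rshrn and
     * srshr implement exactly this rounding; the two idct_helper
     * branches differ only in how the result is narrowed to 16 bits. */
    static int16_t descale(int32_t x, int n)
    {
      return (int16_t)((x + (1 << (n - 1))) >> n);
    }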
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x2
- TMP4 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Save all used NEON registers */
- sub sp, sp, 64
- mov x9, sp
- /* Load constants (v3.4h is just used for padding) */
- adr TMP4, Ljsimd_idct_4x4_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | v8.4h | v9.4h
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | v14.4h | v15.4h
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0] /* 128 bit q4 */
- ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0] /* 128 bit q6 */
- mul v8.4h, v8.4h, v22.4h
- mul v9.4h, v9.4h, v23.4h
- ins v8.d[1], v9.d[0] /* 128 bit q8 */
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0] /* 128 bit q10 */
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0] /* 128 bit q12 */
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v14.4h, v14.4h, v28.4h
- mul v15.4h, v15.4h, v29.4h
- ins v14.d[1], v15.d[0] /* 128 bit q14 */
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0] /* 128 bit q16 */
-
- /* Pass 1 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
- v4.4h, v6.4h, v8.4h, v10.4h
- transpose_4x4 v4, v6, v8, v10, v3
- ins v10.d[1], v11.d[0]
- idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
- v5.4h, v7.4h, v9.4h, v11.4h
- transpose_4x4 v5, v7, v9, v11, v3
- ins v10.d[1], v11.d[0]
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
- v26.4h, v27.4h, v28.4h, v29.4h
- transpose_4x4 v26, v27, v28, v29, v3
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- ins v28.d[1], v29.d[0]
- add v26.8h, v26.8h, v30.8h
- add v28.8h, v28.8h, v30.8h
- sqxtun v26.8b, v26.8h
- sqxtun v27.8b, v28.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- ldp TMP3, TMP4, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
- /* We can use far fewer instructions on little-endian systems if the
- * OS kernel is not configured to trap unaligned memory accesses
- */
- st1 {v26.s}[0], [TMP1], 4
- st1 {v27.s}[0], [TMP3], 4
- st1 {v26.s}[1], [TMP2], 4
- st1 {v27.s}[1], [TMP4], 4
-#else
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[0], [TMP3], 1
- st1 {v26.b}[1], [TMP1], 1
- st1 {v27.b}[1], [TMP3], 1
- st1 {v26.b}[2], [TMP1], 1
- st1 {v27.b}[2], [TMP3], 1
- st1 {v26.b}[3], [TMP1], 1
- st1 {v27.b}[3], [TMP3], 1
-
- st1 {v26.b}[4], [TMP2], 1
- st1 {v27.b}[4], [TMP4], 1
- st1 {v26.b}[5], [TMP2], 1
- st1 {v27.b}[5], [TMP4], 1
- st1 {v26.b}[6], [TMP2], 1
- st1 {v27.b}[6], [TMP4], 1
- st1 {v26.b}[7], [TMP2], 1
- st1 {v27.b}[7], [TMP4], 1
-#endif
-
- /* vpop {v8.4h - v15.4h} ; not available */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting a reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON-optimized function is
- * bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* v14[0] */
- .short FIX_0_850430095 /* v14[1] */
- .short -FIX_1_272758580 /* v14[2] */
- .short FIX_3_624509785 /* v14[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v14.h[3]
- smlal v26.4s, \x10, v14.h[2]
- smlal v26.4s, \x12, v14.h[1]
- smlal v26.4s, \x16, v14.h[0]
-
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v15.4s, v15.4s, #\shift
- xtn \y26, v20.4s
- xtn \y27, v15.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y27, v15.4s, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants */
- adr TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v14.4h}, [TMP2]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | - | -
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | - | -
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* Dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0]
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0]
-
- /* Pass 1 */
-#if 0
- idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
- transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
- idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
- transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
-#else
- smull v26.4s, v6.4h, v14.h[3]
- smlal v26.4s, v10.4h, v14.h[2]
- smlal v26.4s, v12.4h, v14.h[1]
- smlal v26.4s, v16.4h, v14.h[0]
- smull v24.4s, v7.4h, v14.h[3]
- smlal v24.4s, v11.4h, v14.h[2]
- smlal v24.4s, v13.4h, v14.h[1]
- smlal v24.4s, v17.4h, v14.h[0]
- sshll v15.4s, v4.4h, #15
- sshll v30.4s, v5.4h, #15
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
- rshrn v4.4h, v20.4s, #13
- rshrn v6.4h, v15.4s, #13
- add v20.4s, v30.4s, v24.4s
- sub v15.4s, v30.4s, v24.4s
- rshrn v5.4h, v20.4s, #13
- rshrn v7.4h, v15.4s, #13
- ins v4.d[1], v5.d[0]
- ins v6.d[1], v7.d[0]
- transpose v4, v6, v3, .16b, .8h
- transpose v6, v10, v3, .16b, .4s
- ins v11.d[0], v10.d[1]
- ins v7.d[0], v6.d[1]
-#endif
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- add v26.8h, v26.8h, v30.8h
- sqxtun v30.8b, v26.8h
- ins v26.d[0], v30.d[0]
- sqxtun v27.8b, v26.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[4], [TMP1], 1
- st1 {v26.b}[1], [TMP2], 1
- st1 {v27.b}[5], [TMP2], 1
-
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
* jsimd_ycc_extrgb_convert_neon
* jsimd_ycc_extbgr_convert_neon
* jsimd_ycc_extrgbx_convert_neon
@@ -1543,7 +960,7 @@ asm_function jsimd_idct_2x2_neon
.else
.error unsupported macroblock size
.endif
- .elseif \bpp==16
+ .elseif \bpp == 16
.if \size == 8
st1 {v25.8h}, [RGB], 16
.elseif \size == 4
@@ -1662,21 +1079,6 @@ asm_function jsimd_idct_2x2_neon
do_yuv_to_rgb_stage2
.endm
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
@@ -1702,13 +1104,9 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
mov x9, sp
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
- .if \fast_st3 == 1
- adr x15, Ljsimd_ycc_\colorid\()_neon_consts
- .else
- adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
- .endif
+ get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts
- /* Save NEON registers */
+ /* Save Neon registers */
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h}, [x15], 16
@@ -1993,7 +1391,7 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
.endm
/* TODO: expand macros and interleave instructions if some in-order
- * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
+ * AArch64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
do_rgb_to_yuv_stage2
do_load \bpp, 8, \fast_ld3
@@ -2003,17 +1401,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
do_rgb_to_yuv_stage1
.endm
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
@@ -2036,11 +1423,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
N .req w12
/* Load constants to d0, d1, d2, d3 */
- .if \fast_ld3 == 1
- adr x13, Ljsimd_\colorid\()_ycc_neon_consts
- .else
- adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
- .endif
+ get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
ld1 {v0.8h, v1.8h}, [x13]
ldr OUTPUT_BUF0, [OUTPUT_BUF]
@@ -2048,7 +1431,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
.unreq OUTPUT_BUF
- /* Save NEON registers */
+ /* Save Neon registers */
sub sp, sp, #64
mov x9, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
@@ -2147,85 +1530,9 @@ generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
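In scalar terms the conversion is just a subtraction of the sample midpoint (libjpeg's CENTERJSAMPLE, 128 for 8-bit data); a sketch of one row:

    #include <stdint.h>

    /* Scalar equivalent of one row of the routine below: unsigned
     * 8-bit samples become signed, zero-centered 16-bit DCT inputs
     * (each usubl subtracts the 128 that was dup'ed into v0). */
    static void convsamp_row(const uint8_t *sample, int16_t *workspace)
    {
      int i;
      for (i = 0; i < 8; i++)
        workspace[i] = (int16_t)(sample[i] - 128);
    }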
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_islow_neon
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the IJG's original jfdctint.c; see jfdctint.c for
* more details.
@@ -2234,68 +1541,24 @@ asm_function jsimd_convsamp_neon
* rid of a bunch of VLD1.16 instructions
*/
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short -F_0_899
- .short F_1_175
- .short F_1_501
- .short -F_1_847
- .short -F_1_961
- .short F_2_053
- .short -F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
asm_function jsimd_fdct_islow_neon
@@ -2303,16 +1566,16 @@ asm_function jsimd_fdct_islow_neon
TMP .req x9
/* Load constants */
- adr TMP, Ljsimd_fdct_islow_neon_consts
+ get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
ld1 {v0.8h, v1.8h}, [TMP]
- /* Save NEON registers */
+ /* Save Neon registers */
sub sp, sp, #64
mov x10, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
- /* Load all DATA into NEON registers with the following allocation:
+ /* Load all DATA into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | d16 | d17 | v16.8h
@@ -2353,8 +1616,8 @@ asm_function jsimd_fdct_islow_neon
add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
- shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
- shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+ shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+ shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2368,8 +1631,8 @@ asm_function jsimd_fdct_islow_neon
rshrn v18.4h, v18.4s, #DESCALE_P1
rshrn v22.4h, v22.4s, #DESCALE_P1
- rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+ rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
/* Odd part */
@@ -2395,10 +1658,10 @@ asm_function jsimd_fdct_islow_neon
smull2 v13.4s, v9.8h, XFIX_N_2_562
smull2 v14.4s, v10.8h, XFIX_N_1_961
smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
add v10.4s, v10.4s, v4.4s /* z3 += z5 */
add v14.4s, v14.4s, v5.4s
@@ -2427,10 +1690,10 @@ asm_function jsimd_fdct_islow_neon
rshrn v21.4h, v29.4s, #DESCALE_P1
rshrn v19.4h, v30.4s, #DESCALE_P1
rshrn v17.4h, v31.4s, #DESCALE_P1
- rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
/* Transpose */
transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
@@ -2456,8 +1719,8 @@ asm_function jsimd_fdct_islow_neon
add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
- srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
- srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+ srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+ srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2471,8 +1734,8 @@ asm_function jsimd_fdct_islow_neon
rshrn v18.4h, v18.4s, #DESCALE_P2
rshrn v22.4h, v22.4s, #DESCALE_P2
- rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+ rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
/* Odd part */
add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
@@ -2498,10 +1761,10 @@ asm_function jsimd_fdct_islow_neon
smull2 v13.4s, v9.8h, XFIX_N_2_562
smull2 v14.4s, v10.8h, XFIX_N_1_961
smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
add v10.4s, v10.4s, v4.4s
add v14.4s, v14.4s, v5.4s
@@ -2530,16 +1793,16 @@ asm_function jsimd_fdct_islow_neon
rshrn v21.4h, v29.4s, #DESCALE_P2
rshrn v19.4h, v30.4s, #DESCALE_P2
rshrn v17.4h, v31.4s, #DESCALE_P2
- rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
/* store results */
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- /* Restore NEON registers */
+ /* Restore Neon registers */
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
@@ -2565,405 +1828,10 @@ asm_function jsimd_fdct_islow_neon
/*****************************************************************************/
/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c.
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433 v0.h[0]
-#define XFIX_0_541196100 v0.h[1]
-#define XFIX_0_707106781 v0.h[2]
-#define XFIX_1_306562965 v0.h[3]
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req x0
- TMP .req x9
-
- /* Load constants */
- adr TMP, Ljsimd_fdct_ifast_neon_consts
- ld1 {v0.4h}, [TMP]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | v0.8h
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- mov TMP, #2
- sub DATA, DATA, #64
-1:
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
- subs TMP, TMP, #1
- /* 1-D FDCT */
- add v4.8h, v19.8h, v20.8h
- sub v20.8h, v19.8h, v20.8h
- sub v28.8h, v18.8h, v21.8h
- add v18.8h, v18.8h, v21.8h
- sub v29.8h, v17.8h, v22.8h
- add v17.8h, v17.8h, v22.8h
- sub v21.8h, v16.8h, v23.8h
- add v16.8h, v16.8h, v23.8h
- sub v6.8h, v17.8h, v18.8h
- sub v7.8h, v16.8h, v4.8h
- add v5.8h, v17.8h, v18.8h
- add v6.8h, v6.8h, v7.8h
- add v4.8h, v16.8h, v4.8h
- sqdmulh v6.8h, v6.8h, XFIX_0_707106781
- add v19.8h, v20.8h, v28.8h
- add v16.8h, v4.8h, v5.8h
- sub v20.8h, v4.8h, v5.8h
- add v5.8h, v28.8h, v29.8h
- add v29.8h, v29.8h, v21.8h
- sqdmulh v5.8h, v5.8h, XFIX_0_707106781
- sub v28.8h, v19.8h, v29.8h
- add v18.8h, v7.8h, v6.8h
- sqdmulh v28.8h, v28.8h, XFIX_0_382683433
- sub v22.8h, v7.8h, v6.8h
- sqdmulh v19.8h, v19.8h, XFIX_0_541196100
- sqdmulh v7.8h, v29.8h, XFIX_1_306562965
- add v6.8h, v21.8h, v5.8h
- sub v5.8h, v21.8h, v5.8h
- add v29.8h, v29.8h, v28.8h
- add v19.8h, v19.8h, v28.8h
- add v29.8h, v29.8h, v7.8h
- add v21.8h, v5.8h, v19.8h
- sub v19.8h, v5.8h, v19.8h
- add v17.8h, v6.8h, v29.8h
- sub v23.8h, v6.8h, v29.8h
-
- b.ne 1b
-
- /* store results */
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
- br x30
-
- .unreq DATA
- .unreq TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- */
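The divisors table is consumed in three 64-entry slices (reciprocals at offset 0, corrections at 64*2 bytes, shifts at 64*6 bytes), giving a divide-free quantizer. A scalar sketch of one coefficient follows; the parameter names are descriptive, not the actual libjpeg types, and the conditional negate stands in for the NEON sign-mask/eor/sub sequence, which is equivalent:

    #include <stdint.h>

    /* q = ((|coef| + correction) * reciprocal >> 16) >> shift,
     * then the original sign is restored. */
    static int16_t quantize_one(int16_t coef, uint16_t reciprocal,
                                uint16_t correction, uint16_t shift)
    {
      uint16_t mag = (uint16_t)(coef < 0 ? -coef : coef);
      uint16_t biased = (uint16_t)(mag + correction);
      uint16_t q = (uint16_t)((((uint32_t)biased * reciprocal) >> 16) >> shift);
      return (int16_t)(coef < 0 ? -(int32_t)q : (int32_t)q);
    }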
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req x0
- DIVISORS .req x1
- WORKSPACE .req x2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req x9
- SHIFT .req x10
- LOOP_COUNT .req x11
-
- mov LOOP_COUNT, #2
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
-1:
- subs LOOP_COUNT, LOOP_COUNT, #1
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
- abs v20.8h, v0.8h
- abs v21.8h, v1.8h
- abs v22.8h, v2.8h
- abs v23.8h, v3.8h
- ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
- add v20.8h, v20.8h, v4.8h /* add correction */
- add v21.8h, v21.8h, v5.8h
- add v22.8h, v22.8h, v6.8h
- add v23.8h, v23.8h, v7.8h
- umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
- umull2 v16.4s, v20.8h, v28.8h
- umull v5.4s, v21.4h, v29.4h
- umull2 v17.4s, v21.8h, v29.8h
- umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
- umull2 v18.4s, v22.8h, v30.8h
- umull v7.4s, v23.4h, v31.4h
- umull2 v19.4s, v23.8h, v31.8h
- ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
- shrn v4.4h, v4.4s, #16
- shrn v5.4h, v5.4s, #16
- shrn v6.4h, v6.4s, #16
- shrn v7.4h, v7.4s, #16
- shrn2 v4.8h, v16.4s, #16
- shrn2 v5.8h, v17.4s, #16
- shrn2 v6.8h, v18.4s, #16
- shrn2 v7.8h, v19.4s, #16
- neg v24.8h, v24.8h
- neg v25.8h, v25.8h
- neg v26.8h, v26.8h
- neg v27.8h, v27.8h
- sshr v0.8h, v0.8h, #15 /* extract sign */
- sshr v1.8h, v1.8h, #15
- sshr v2.8h, v2.8h, #15
- sshr v3.8h, v3.8h, #15
- ushl v4.8h, v4.8h, v24.8h /* shift */
- ushl v5.8h, v5.8h, v25.8h
- ushl v6.8h, v6.8h, v26.8h
- ushl v7.8h, v7.8h, v27.8h
-
- eor v4.16b, v4.16b, v0.16b /* restore sign */
- eor v5.16b, v5.16b, v1.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v3.16b
- sub v4.8h, v4.8h, v0.8h
- sub v5.8h, v5.8h, v1.8h
- sub v6.8h, v6.8h, v2.8h
- sub v7.8h, v7.8h, v3.8h
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
- b.ne 1b
-
- br x30 /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor,
- * JDIMENSION width_blocks, JSAMPARRAY input_data,
- * JSAMPARRAY output_data);
- */
-
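The 0x10000 seed dup'ed into v16 puts an alternating 0,1 rounding bias into the 16-bit accumulator lanes, matching IJG's scalar downsampler; in C terms:

    #include <stdint.h>

    /* Scalar equivalent of the column loops below: each output pixel
     * is the rounded average of two horizontal neighbors, with the
     * rounding bias alternating 0,1,0,1 across output columns. */
    static void h2v1_downsample_row(const uint8_t *in, uint8_t *out,
                                    int output_cols)
    {
      int i, bias = 0;
      for (i = 0; i < output_cols; i++) {
        out[i] = (uint8_t)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }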
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
- 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
- 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
- .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
- .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
- .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
-
-asm_function jsimd_h2v1_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR .req x10
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #0x10000
- lsl TMP2, BLOCK_WIDTH, #4
- sub TMP2, TMP2, IMAGE_WIDTH
- adr TMP3, Ljsimd_h2_downsample_neon_consts
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR]
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- uadalp v4.8h, v2.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- * JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
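Likewise for the 2:1 x 2:1 case: four neighbors are averaged with an alternating 1,2 bias (the 0x00020001 pattern built in TMPDUP) and a shift by 2:

    #include <stdint.h>

    /* Scalar equivalent of the loops below; in0/in1 are two adjacent
     * input rows, and the rounding bias alternates 1,2,1,2. */
    static void h2v2_downsample_row(const uint8_t *in0, const uint8_t *in1,
                                    uint8_t *out, int output_cols)
    {
      int i, bias = 1;
      for (i = 0; i < output_cols; i++) {
        out[i] = (uint8_t)((in0[2 * i] + in0[2 * i + 1] +
                            in1[2 * i] + in1[2 * i + 1] + bias) >> 2);
        bias ^= 3;  /* toggle between 1 and 2 */
      }
    }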
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR0 .req x10
- INPTR1 .req x14
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #1
- lsl TMP2, BLOCK_WIDTH, #4
- lsl TMPDUP, TMPDUP, #17
- sub TMP2, TMP2, IMAGE_WIDTH
- adr TMP3, Ljsimd_h2_downsample_neon_consts
- orr TMPDUP, TMPDUP, #1
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR0, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- ldr INPTR1, [INPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- uadalp v4.8h, v1.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- tbl v3.16b, {v1.16b}, v18.16b
- uadalp v4.8h, v2.16b
- uadalp v4.8h, v3.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR0
- .unreq INPTR1
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
*
*/
@@ -3010,41 +1878,6 @@ asm_function jsimd_h2v2_downsample_neon
.macro generate_jsimd_huff_encode_one_block fast_tbl
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
- .byte 0, 1, 2, 3, 16, 17, 32, 33, \
- 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
- .byte 34, 35, 48, 49, 255, 255, 50, 51, \
- 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
- .byte 8, 9, 22, 23, 36, 37, 50, 51, \
- 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
- .byte 54, 55, 40, 41, 26, 27, 12, 13, \
- 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
- .byte 6, 7, 20, 21, 34, 35, 48, 49, \
- 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
- .byte 42, 43, 28, 29, 14, 15, 30, 31, \
- 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
- .byte 255, 255, 255, 255, 56, 57, 42, 43, \
- 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
- .byte 26, 27, 40, 41, 42, 43, 28, 29, \
- 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
- .byte 255, 255, 255, 255, 0, 1, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
- .byte 4, 5, 6, 7, 255, 255, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
-.endif
-
.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
@@ -3052,13 +1885,9 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
- /* Save ARM registers */
+ /* Save Arm registers */
stp x19, x20, [sp]
-.if \fast_tbl == 1
- adr x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
- adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
ldr PUT_BUFFER, [x0, #0x10]
ldr PUT_BITSw, [x0, #0x18]
ldrsh w12, [x2] /* load DC coeff in w12 */
@@ -3278,7 +2107,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
put_bits x10, x11
addp v16.16b, v16.16b, v18.16b
checkbuf47
- umov x9,v16.D[0]
+ umov x9, v16.D[0]
put_bits x13, x12
cnt v17.8b, v16.8b
mvn x9, x9
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment) __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
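
Editor's note: a brief usage sketch of the macro defined above (the table name is hypothetical). Placing ALIGN(16) on a static table guarantees that 16-byte-aligned Neon vector loads of the data are safe:

/* Hypothetical example: a 16-byte-aligned constant table */
ALIGN(16) static const unsigned char identity_map[16] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};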
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+ F_0_298, F_0_587, F_0_113, F_0_168,
+ F_0_331, F_0_500, F_0_418, F_0_081
+};
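
Editor's note: the macro names above encode truncated coefficient values. A sketch of how they can be derived from the exact RGB -> YCbCr coefficients; the FIX() helper below mirrors the one used by the scalar code in jccolor.c:

/* Q16 fixed-point scaling, as in jccolor.c: x * 2^16, rounded */
#define FIX(x)  ((int)((x) * 65536.0 + 0.5))
/* FIX(0.29900) == 19595 (F_0_298), FIX(0.58700) == 38470 (F_0_587),
 * FIX(0.11400) ==  7471 (F_0_113), FIX(0.16874) == 11059 (F_0_168),
 * FIX(0.33126) == 21709 (F_0_331), FIX(0.50000) == 32768 (F_0_500),
 * FIX(0.41869) == 27439 (F_0_418), FIX(0.08131) ==  5329 (F_0_081)
 */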
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
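
Editor's note: each #define/#include pair above stamps out one specialized conversion routine from the shared jccolext-neon.c template. As an illustration of the pattern, the EXT_BGR block behaves as if the following function had been written by hand (signature per jsimd.h; component offsets taken from the EXT_BGR_* macros in jmorecfg.h):

/* Effective expansion for the EXT_BGR case, shown for illustration only */
void jsimd_extbgr_ycc_convert_neon(JDIMENSION image_width,
                                   JSAMPARRAY input_buf,
                                   JSAMPIMAGE output_buf,
                                   JDIMENSION output_row, int num_rows);
/* ...compiled with RGB_RED == 2, RGB_GREEN == 1, RGB_BLUE == 0, and
 * RGB_PIXELSIZE == 3, so the vector loads pick B, G, R out of 3-byte pixels.
 */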
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
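
Editor's note: for reference, a scalar sketch of the same Q16 arithmetic that the vector code below applies to each pixel (a hypothetical helper, not part of the library):

static inline unsigned char rgb_to_gray_scalar(unsigned char r,
                                               unsigned char g,
                                               unsigned char b)
{
  /* 19595 + 38470 + 7471 == 65536, so y fits comfortably in 32 bits. */
  unsigned int y = 19595 * r + 38470 * g + 7471 * b;
  return (unsigned char)((y + 32768) >> 16);  /* rounding right shift by 16 */
}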
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr;
+ JSAMPROW outptr;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 16) {
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first copied via memcpy() to a
+ * temporary buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 16) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+ uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+ uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+ uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
+ * permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr += 16;
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE 64
+#else
+#define BIT_BUF_SIZE 32
+#endif
+
+typedef struct {
+ size_t put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
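
Editor's note: the pointer update in the last line of EMIT_BYTE() is deliberately branch-free; tracing both cases makes the intent clear:

/* b == 0xFF: (b < 0xFF) == 0  =>  buffer -= -2  =>  buffer += 2;
 *            the speculative 0x00 stuffing byte just written is kept.
 * b <  0xFF: (b < 0xFF) == 1  =>  buffer -= -1  =>  buffer += 1;
 *            the speculative 0x00 byte is overwritten by the next output.
 */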
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer ); \
+ buffer += 4; \
+}
+#else
+#define SPLAT() { \
+ put_buffer = __builtin_bswap32(put_buffer); \
+ __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT(); \
+ } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+ diff |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(diff, nbits) \
+}
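
Editor's note: a minimal usage sketch of the bit-buffer macros (hypothetical local names, mirroring how the Huffman encoder initializes them from a working_state). Appending codes only touches memory when the accumulator fills:

/* Sketch, assuming locals set up from a working_state *state:
 *   size_t put_buffer = state->cur.put_buffer;
 *   int free_bits = state->cur.free_bits;     (BIT_BUF_SIZE when empty)
 *   JOCTET *buffer = state->next_output_byte;
 */
PUT_BITS(0x16, 5)   /* append bits 10110; FLUSH() runs only if free_bits < 0 */
PUT_BITS(0x05, 3)   /* append bits 101; codes accumulate MSB-first */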
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..b91c5db478
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,622 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *values_ptr = values;
+ JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs);
+ vst1q_s16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(values_ptr, vdupq_n_s16(0));
+ vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0;
+ zerobits[1] = ~bitmap1;
+#endif
+}
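
Editor's note: a scalar sketch of the zerobits semantics established above, assuming the 64-bit (AArch64) layout in which the bitmap fits a single size_t (a hypothetical helper, for illustration):

static size_t zerobits_scalar(const JCOEF *values)
{
  size_t bits = 0;
  int k;

  for (k = 0; k < 64; k++)
    if (values[k] != 0)
      bits |= (size_t)1 << k;  /* bit k set <=> coefficient k is nonzero */
  return bits;
}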
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits)
+{
+ /* Temporary storage buffers for data used to compute the signbits bitmap and
+ * the end-of-block (EOB) position
+ */
+ uint8_t coef_sign_bits[64];
+ uint8_t coef_eq1_bits[64];
+
+ JCOEF *absvalues_ptr = absvalues;
+ uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+ uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+ vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+ vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. */
+ int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+ int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+ int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+ int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+ int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+ int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+ int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+ int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+ abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+ abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+ abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+ abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+ abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+ abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+ abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap0;
+ bits[1] = ~bitmap1;
+#endif
+
+ /* Construct signbits bitmap. */
+ uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+ uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+ uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+ uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+ uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+ uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+ uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+ uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+ signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+ signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+ signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+ signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+ signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+ signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+ signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+ signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+ bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+ bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+ bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store signbits bitmap. */
+ bits[1] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store signbits bitmap. */
+ bits[2] = ~bitmap0;
+ bits[3] = ~bitmap1;
+#endif
+
+ /* Construct bitmap to find EOB position (the index of the last coefficient
+ * equal to 1.)
+ */
+ uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+ uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+ uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+ uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+ uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+ uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+ uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+ uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+ row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+ row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+ row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+ row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+ row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+ row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+ row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+ row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+ bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+ bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+ bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Return EOB position. */
+ if (bitmap == 0) {
+ /* EOB position is defined to be 0 if all coefficients != 1. */
+ return 0;
+ } else {
+ return 63 - BUILTIN_CLZLL(bitmap);
+ }
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+ /* Return EOB position. */
+ if (bitmap0 == 0 && bitmap1 == 0) {
+ return 0;
+ } else if (bitmap1 != 0) {
+ return 63 - BUILTIN_CLZ(bitmap1);
+ } else {
+ return 31 - BUILTIN_CLZ(bitmap0);
+ }
+#endif
+}
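
Editor's note: the EOB computation above reduces to a highest-set-bit search. A scalar sketch of the 64-bit path, where bit k of the bitmap is set iff absvalues[k] == 1 (BUILTIN_CLZLL() is expected to map to __builtin_clzll() on GCC/Clang via neon-compat.h):

static int eob_from_eq1_bitmap(uint64_t bitmap)
{
  if (bitmap == 0)
    return 0;  /* EOB position is defined to be 0 if no coefficient == 1 */
  return 63 - __builtin_clzll(bitmap);  /* index of the highest set bit */
}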
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+ uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+ pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+ vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
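
Editor's note on the { 0, 1 } bias: a worked example of two adjacent output columns with the same input pair shows the rounding error alternating direction rather than biasing the image consistently brighter or darker:

/* Input pair (10, 11) in consecutive output columns:
 *   even column: (10 + 11 + 0) >> 1 == 10   (rounds down)
 *   odd column:  (10 + 11 + 1) >> 1 == 11   (rounds up)
 */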
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr0, inptr1, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[outrow];
+ inptr1 = input_data[outrow + 1];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+ */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ /* Store samples to memory and increment pointers. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels_r0 =
+ vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 =
+ vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+ pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+ uint8x8x2_t table_r0 =
+ { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+ uint8x8x2_t table_r1 =
+ { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+ pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+ pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
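
Editor's note: the { 1, 2 } bias plays the same role for the 2x2 average, where the divisor is 4. A worked example:

/* Four input pixels summing to 42:
 *   even column: (42 + 1) >> 2 == 10   (rounds down)
 *   odd column:  (42 + 2) >> 2 == 11   (rounds up)
 * so the rounding errors alternate between neighboring columns instead of
 * accumulating in one direction.
 */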
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
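+/* For illustration only (excluded from compilation): a scalar model of the
+ * fixed-point arithmetic used below.  The helper names and the use of 32-bit
+ * intermediates are assumptions made for clarity; clamping of the results to
+ * [0, 255] is omitted.
+ */
+#if 0
+static inline int descale(int x, int shift)
+{
+  /* Divide by 2^shift, rounding to nearest. */
+  return (x + (1 << (shift - 1))) >> shift;
+}
+
+static inline void ycc_to_rgb_scalar(int y, int cb, int cr,
+                                     int *r, int *g, int *b)
+{
+  *r = y + descale(22971 * (cr - 128), 14);        /* 1.40200 * (Cr - 128) */
+  *g = y - descale(11277 * (cb - 128) +
+                   23401 * (cr - 128), 15);
+  *b = y + descale(29033 * (cb - 128), 14);        /* 1.77200 * (Cb - 128) */
+}
+#endif
+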
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ uint8x16_t y = vld1q_u8(inptr0);
+ uint8x16_t cb = vld1q_u8(inptr1);
+ uint8x16_t cr = vld1q_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cr)));
+ int16x8_t cr_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cr)));
+ int16x8_t cb_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cb)));
+ int16x8_t cb_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cb)));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+ int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+ consts, 0);
+ int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+ int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+ consts, 0);
+ g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+ consts, 1);
+ g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+ consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+ vrshrn_n_s32(g_sub_y_lh, 15));
+ int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+ vrshrn_n_s32(g_sub_y_hh, 15));
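+      /* vqrdmulhq_lane_s16() computes ((2 * a * b) + 2^15) >> 16 with
+       * saturation, so doubling the (Cr - 128) operand first yields the
+       * rounded product ((Cr - 128) * F_1_402) >> 14.
+       */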
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+ consts, 2);
+ int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+ consts, 3);
+ int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t r_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t b_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t b_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t g_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t g_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+ vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x16x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 16;
+ inptr2 += 16;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining >= 8) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ vst3_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 8;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 8);
+ cols_remaining -= 8;
+ }
+
+ /* Handle the tail elements. */
+ if (cols_remaining > 0) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB565 pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..ea4668f1d3
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,142 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
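+/* The first inclusion below uses the default RGB_RED/RGB_GREEN/RGB_BLUE/
+ * RGB_PIXELSIZE definitions to build the base jsimd_ycc_rgb_convert_neon
+ * routine; each subsequent inclusion redefines those macros and the function
+ * name to generate one variant per supported output pixel format.
+ */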
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..e4f91fdc0e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
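+    /* (In scalar terms: the same chroma-derived values are added to Y[2*i]
+     * and Y[2*i+1], i.e. each chroma sample is replicated across a pair of
+     * output pixels.)
+     */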
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba;
+ rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#else
+ uint8x16x3_t rgb;
+ rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba_h;
+ rgba_h.val[RGB_RED] = r.val[1];
+ rgba_h.val[RGB_GREEN] = g.val[1];
+ rgba_h.val[RGB_BLUE] = b.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ uint8x8x4_t rgba_l;
+ rgba_l.val[RGB_RED] = r.val[0];
+ rgba_l.val[RGB_GREEN] = g.val[0];
+ rgba_l.val[RGB_BLUE] = b.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr, rgba_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb_h;
+ rgb_h.val[RGB_RED] = r.val[1];
+ rgb_h.val[RGB_GREEN] = g.val[1];
+ rgb_h.val[RGB_BLUE] = b.val[1];
+ uint8x8x3_t rgb_l;
+ rgb_l.val[RGB_RED] = r.val[0];
+ rgb_l.val[RGB_GREEN] = g.val[0];
+ rgb_l.val[RGB_BLUE] = b.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr, rgb_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr0, outptr1;
+ /* Pointers to Y (both rows), Cb, and Cr data */
+ JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+ inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
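+    /* Each chroma sample is thus shared by a 2x2 block of output pixels:
+     * both rows reuse the same chroma-derived values, and within each row
+     * those values are applied to both the even and odd Y samples.
+     */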
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba0, rgba1;
+ rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr0, rgba0);
+ vst4q_u8(outptr1, rgba1);
+#else
+ uint8x16x3_t rgb0, rgb1;
+ rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr0, rgb0);
+ vst3q_u8(outptr1, rgb1);
+#endif
+
+ /* Increment pointers. */
+ inptr0_0 += 16;
+ inptr0_1 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr0 += (RGB_PIXELSIZE * 16);
+ outptr1 += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba0_h, rgba1_h;
+ rgba0_h.val[RGB_RED] = r0.val[1];
+ rgba1_h.val[RGB_RED] = r1.val[1];
+ rgba0_h.val[RGB_GREEN] = g0.val[1];
+ rgba1_h.val[RGB_GREEN] = g1.val[1];
+ rgba0_h.val[RGB_BLUE] = b0.val[1];
+ rgba1_h.val[RGB_BLUE] = b1.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+ uint8x8x4_t rgba0_l, rgba1_l;
+ rgba0_l.val[RGB_RED] = r0.val[0];
+ rgba1_l.val[RGB_RED] = r1.val[0];
+ rgba0_l.val[RGB_GREEN] = g0.val[0];
+ rgba1_l.val[RGB_GREEN] = g1.val[0];
+ rgba0_l.val[RGB_BLUE] = b0.val[0];
+ rgba1_l.val[RGB_BLUE] = b1.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+ vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+ vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+ vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+ vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+ vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+ vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+ vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr0, rgba0_l);
+ vst4_u8(outptr1, rgba1_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+ vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+ vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+ vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+ vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+ vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+ vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr0, rgba0_l, 0);
+ vst4_lane_u8(outptr1, rgba1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb0_h, rgb1_h;
+ rgb0_h.val[RGB_RED] = r0.val[1];
+ rgb1_h.val[RGB_RED] = r1.val[1];
+ rgb0_h.val[RGB_GREEN] = g0.val[1];
+ rgb1_h.val[RGB_GREEN] = g1.val[1];
+ rgb0_h.val[RGB_BLUE] = b0.val[1];
+ rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+ uint8x8x3_t rgb0_l, rgb1_l;
+ rgb0_l.val[RGB_RED] = r0.val[0];
+ rgb1_l.val[RGB_RED] = r1.val[0];
+ rgb0_l.val[RGB_GREEN] = g0.val[0];
+ rgb1_l.val[RGB_GREEN] = g1.val[0];
+ rgb0_l.val[RGB_BLUE] = b0.val[0];
+ rgb1_l.val[RGB_BLUE] = b1.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+ vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+ vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+ vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+ vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+ vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+ vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+ vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr0, rgb0_l);
+ vst3_u8(outptr1, rgb1_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+ vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+ vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+ vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+ vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+ vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+ vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr0, rgb0_l, 0);
+ vst3_lane_u8(outptr1, rgb1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ * p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ * p0(upsampled) = s0
+ * p5(upsampled) = s2
+ */
+
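+/* For illustration only (excluded from compilation): a scalar model of the
+ * general case, mirroring the equivalent loop in jdsample.c.  The variable
+ * names are hypothetical, and the special cases for the first and last
+ * columns are omitted.  The alternating +1/+2 terms provide the rounding
+ * bias referred to as "ordered dithering" below.
+ */
+#if 0
+for (col = 1; col < downsampled_width - 1; col++) {
+  out[2 * col]     = (JSAMPLE)((3 * in[col] + in[col - 1] + 1) >> 2);
+  out[2 * col + 1] = (JSAMPLE)((3 * in[col] + in[col + 1] + 2) >> 2);
+}
+#endif
+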
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* First pixel component value in this row of the original image */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* 3/4 * containing sample + 1/4 * nearest neighboring sample
+ * For p1: containing sample = s0, nearest neighboring sample = s1
+ * For p2: containing sample = s1, nearest neighboring sample = s0
+ */
+ uint8x16_t s0 = vld1q_u8(inptr);
+ uint8x16_t s1 = vld1q_u8(inptr + 1);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
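+    /* These sums receive an explicit bias of 1 and are later narrowed with a
+     * truncating shift, while the s1_add_3s0 sums are narrowed with a
+     * rounding shift (an effective bias of 2); the alternating biases
+     * reproduce the ordered dithering of the scalar implementation.
+     */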
+
+ /* The offset is initially 1, because the first pixel component has already
+ * been stored. However, in subsequent iterations of the SIMD loop, this
+ * offset is (2 * colctr - 1) to stay within the bounds of the sample
+ * buffers without having to resort to a slow scalar tail case for the last
+ * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
+ * in jmemmgr.c for more details.
+ */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+ /* We use software pipelining to maximise performance. The code indented
+ * an extra two spaces begins the next iteration of the loop.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1;
+ }
+
+ /* Complete the last iteration of the loop. */
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Last pixel component value in this row of the original image */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * sA | | | |
+ * | p6 p7 | p8 p9 | p10 p11|
+ * +---------+---------+---------+
+ * | p12 p13| p14 p15| p16 p17|
+ * sB | | | |
+ * | p18 p19| p20 p21| p22 p23|
+ * +---------+---------+---------+
+ * | p24 p25| p26 p27| p28 p29|
+ * sC | | | |
+ * | p30 p31| p32 p33| p34 p35|
+ * +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above. To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1. For example:
+ * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ * 1/4 * (3/4 * s0B + 1/4 * s0A)
+ * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ * p0(upsampled) = s0A
+ * p35(upsampled) = s2C
+ */
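+
+/* As a numeric check of the weights above, take hypothetical sample values
+ * s0A = 0, s0B = 16, s1A = 32, and s1B = 48:
+ *   p14 = 9/16 * 48 + 3/16 * 32 + 3/16 * 16 + 1/16 * 0 = 36
+ * The integer code below reaches the same value via column sums (the
+ * rounding bias is irrelevant here, since the sum divides exactly):
+ *   s0colsum = 3 * 16 + 0 = 48,  s1colsum = 3 * 48 + 32 = 176
+ *   p14 = (3 * s1colsum + s0colsum) >> 4 = 576 >> 4 = 36
+ */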
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+ const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* First pixel component value in this row of the original image */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
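+
+ /* The first output pixel has no horizontal neighbor, so only the vertical
+ * blend applies, i.e. colsum / 4. Writing this as (colsum * 4 + bias) >> 4
+ * keeps the bias terms on the same 1/16 scale as the two-stage blends used
+ * for the interior pixels: 8 rounds to nearest for this even-indexed
+ * output, while the odd-indexed output at the end of the row uses the
+ * ordered dithering bias of 7.
+ */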
+
+ /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both
+ * dimensions at once, right-shifting by 4.
+ */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0A = vld1q_u8(inptr0);
+ uint8x16_t s0B = vld1q_u8(inptr1);
+ uint8x16_t s0C = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+ vget_high_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+ vget_high_u8(s0B), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+ vget_high_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+ vget_high_u8(s1B), three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ uint8x16x2_t output_pixels0 = { {
+ vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+ } };
+ uint8x16x2_t output_pixels1 = { {
+ vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+ } };
+
+ /* Store pixel component values to memory.
+ * The minimum size of the output buffer for each row is 64 bytes => no
+ * need to worry about buffer overflow here. See "Creation of 2-D sample
+ * arrays" in jmemmgr.c for more details.
+ */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end
+ * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+ * bounds of the sample buffers without having to resort to a slow scalar
+ * tail case for the last (downsampled_width % 16) samples. See "Creation
+ * of 2-D sample arrays" in jmemmgr.c for more details.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0A = vld1q_u8(inptr0 + colctr - 1);
+ s0B = vld1q_u8(inptr1 + colctr - 1);
+ s0C = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+ three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+ three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1A = vld1q_u8(inptr0 + colctr);
+ s1B = vld1q_u8(inptr1 + colctr);
+ s1C = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+ three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+ three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Last pixel component value in this row of the original image */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++;
+ }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
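+
+/* For example, with hypothetical sample values sA = 10, sB = 20, sC = 30,
+ * the function below produces, for the output row pair centered on sB:
+ *   upper row: (3 * 20 + 10 + 1) >> 2 = 17   (exact value 17.5, biased down)
+ *   lower row: (3 * 20 + 30 + 2) >> 2 = 23   (exact value 22.5, rounded up)
+ * The complementary +1/+2 rounding terms implement the ordered dithering
+ * bias between even and odd output rows.
+ */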
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32
+ * bytes => no need to worry about buffer overflow when reading/writing
+ * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
+ * details.
+ */
+ for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load samples. */
+ uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+ uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+ uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+ /* Blend samples vertically. */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+ vget_high_u8(sB), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+ vget_high_u8(sB), three_u8);
+ /* Add ordered dithering bias to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel component values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | | |
+ * | p0 p1 | p2 p3 |
+ * | | |
+ * +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above. To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ * p0(upsampled) = p1(upsampled) = s0
+ * p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr + 2 * colctr, output_pixels);
+ }
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * sA | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * sB | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above. To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values for both output rows to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+ vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ * 0.541196100 = 17795 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382 12544
+#define F_0_541 17792
+#define F_0_707 23168
+#define F_0_306 9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306
+};
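+
+/* vqdmulhq_lane_s16(a, consts, i) computes (2 * a * consts[i]) >> 16, so a
+ * constant k stored as roughly k * 2^15 turns the operation into a * k.
+ * For example, MULTIPLY(x, FIX_0_707106781) in jfdctfst.c maps onto
+ * vqdmulhq_lane_s16(x, consts, 2) here, since consts[2] = F_0_707 = 23168
+ * and 23168 * 2^-15 ~= 0.707106781.
+ */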
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t data1 = vld4q_s16(data);
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ /* Load DCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
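+
+/* The F_* multipliers below carry CONST_BITS = 13 fraction bits. Pass 1
+ * descales by CONST_BITS - PASS1_BITS, leaving PASS1_BITS = 2 extra bits of
+ * precision in the intermediate results; pass 2 descales by
+ * CONST_BITS + PASS1_BITS to remove them again, as in jpeg_fdct_islow().
+ */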
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+ F_0_298, -F_0_390, F_0_541, F_0_765,
+ -F_0_899, F_1_175, F_1_501, -F_1_847,
+ -F_1_961, F_2_053, -F_2_562, F_3_072
+};
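+
+/* Several constants are stored negated so that the subtractions in
+ * jpeg_fdct_islow() become multiply-accumulates with a negative multiplier.
+ * For example, the term MULTIPLY(tmp12, -FIX_1_847759065) used to compute
+ * dataptr[6] becomes a vmlal_lane_s16() with consts.val[1] lane 3, which
+ * holds -F_1_847.
+ */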
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+ /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t s_rows_0123 = vld4q_s16(data);
+ int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ int32x4_t z1_l =
+ vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ int32x4_t z1_h =
+ vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t col2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t col2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+ int32x4_t col6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t col6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+ /* Odd part */
+ int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+ int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+ int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+ vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+ vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+ vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+ vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3);
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t row2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t row2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+ int32x4_t row6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t row6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+ /* Odd part */
+ z1 = vaddq_s16(tmp4, tmp7);
+ z2 = vaddq_s16(tmp5, tmp6);
+ z3 = vaddq_s16(tmp4, tmp6);
+ z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+ vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+ vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+ vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+ vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.082392200 = 2688 * 2^-15
+ * 0.414213562 = 13568 * 2^-15
+ * 0.847759065 = 27776 * 2^-15
+ * 0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS 2
+
+#define F_0_082 2688
+#define F_0_414 13568
+#define F_0_847 27776
+#define F_0_613 20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+ F_0_082, F_0_414, F_0_847, F_0_613
+};
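+
+/* The jpeg_idct_ifast() multipliers are all >= 1, so each is split into an
+ * integer part, applied with vector adds, and one of the fractional
+ * constants above, applied with vqdmulh:
+ *   1.082392200 = 1 + 0.082392200
+ *   1.414213562 = 1 + 0.414213562
+ *   1.847759065 = 1 + 0.847759065
+ *   2.613125930 = 2 + 0.613125930
+ */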
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ IFAST_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row4);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
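+
+ /* Each 64-bit half of the bitmap covers four columns of coefficients. If
+ * a half is zero, every AC coefficient in those four columns is zero, and
+ * the pass 1 IDCT of each such column reduces to duplicating its DC value,
+ * which the branches below exploit.
+ */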
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into vectors.
+ */
+ int16x8_t dcval = row0;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ row4 = dcval;
+ row5 = dcval;
+ row6 = dcval;
+ row7 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_low_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_high_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+ row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+ row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+ row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+ row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+ row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+ row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+ row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_high_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_low_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+ row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+ row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+ row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+ row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+ row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+ row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+ row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+ } else {
+ /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+ /* Load quantization table. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x8_t tmp0 = row0;
+ int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+ int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+ int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
+ int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+ int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsubq_s16(tmp10, tmp13);
+ tmp1 = vaddq_s16(tmp11, tmp12);
+ tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+ int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+ int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+ int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+ int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
+ int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+ int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+ tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsubq_s16(tmp11, tmp6);
+ tmp4 = vaddq_s16(tmp10, tmp5);
+
+ row0 = vaddq_s16(tmp0, tmp7);
+ row7 = vsubq_s16(tmp0, tmp7);
+ row1 = vaddq_s16(tmp1, tmp6);
+ row6 = vsubq_s16(tmp1, tmp6);
+ row2 = vaddq_s16(tmp2, tmp5);
+ row5 = vsubq_s16(tmp2, tmp5);
+ row4 = vaddq_s16(tmp3, tmp4);
+ row3 = vsubq_s16(tmp3, tmp4);
+ }
+
+ /* Transpose rows to work on columns in pass 2. */
+ int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+ int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+ int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+ int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+ vreinterpretq_s32_s16(rows_45.val[0]));
+ int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+ vreinterpretq_s32_s16(rows_45.val[1]));
+ int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+ vreinterpretq_s32_s16(rows_67.val[0]));
+ int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+ vreinterpretq_s32_s16(rows_67.val[1]));
+
+ int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+ int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+ int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+ int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+ int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+ int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+ int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+ int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+ int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+ int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+ int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+ int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+ /* 1-D IDCT, pass 2 */
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(col0, col4);
+ int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+ int16x8_t tmp13 = vaddq_s16(col2, col6);
+ int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+ tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+ int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+ int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+ int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part */
+ int16x8_t z13 = vaddq_s16(col5, col3);
+ int16x8_t neg_z10 = vsubq_s16(col3, col5);
+ int16x8_t z11 = vaddq_s16(col1, col7);
+ int16x8_t z12 = vsubq_s16(col1, col7);
+
+ int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+ int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+ col0 = vaddq_s16(tmp0, tmp7);
+ col7 = vsubq_s16(tmp0, tmp7);
+ col1 = vaddq_s16(tmp1, tmp6);
+ col6 = vsubq_s16(tmp1, tmp6);
+ col2 = vaddq_s16(tmp2, tmp5);
+ col5 = vsubq_s16(tmp2, tmp5);
+ col4 = vaddq_s16(tmp3, tmp4);
+ col3 = vsubq_s16(tmp3, tmp4);
+
+ /* Scale down by a factor of 8, narrowing to 8-bit. */
+ int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+ vqshrn_n_s16(col1, PASS1_BITS + 3));
+ int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+ vqshrn_n_s16(col5, PASS1_BITS + 3));
+ int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+ vqshrn_n_s16(col3, PASS1_BITS + 3));
+ int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+ vqshrn_n_s16(col7, PASS1_BITS + 3));
+ /* Clamp to range [0-255]. */
+ uint8x16_t cols_01 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
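+
+ /* Adding CENTERJSAMPLE (128) via the reinterpret casts above relies on
+ * modular arithmetic: vqshrn_n_s16() saturates each value to [-128, 127],
+ * and adding -128 (the int8 reinterpretation of 128) then maps that range
+ * onto [0, 255] when the result is reinterpreted as uint8.
+ */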
+
+ /* Transpose block to prepare for store. */
+ uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+ vreinterpretq_u32_u8(cols_45));
+ uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+ vreinterpretq_u32_u8(cols_67));
+
+ uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+ vreinterpretq_u8_u32(cols_0415.val[1]));
+ uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+ vreinterpretq_u8_u32(cols_2637.val[1]));
+ uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+ vreinterpretq_u16_u8(cols_2367.val[0]));
+ uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+ vreinterpretq_u16_u8(cols_2367.val[1]));
+
+ uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+ uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+ uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+ uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ JSAMPROW outptr4 = output_buf[4] + output_col;
+ JSAMPROW outptr5 = output_buf[5] + output_col;
+ JSAMPROW outptr6 = output_buf[6] + output_col;
+ JSAMPROW outptr7 = output_buf[7] + output_col;
+
+ /* Store the reconstructed sample block to memory. */
+ vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+ vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+ vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+ vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+ vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+ vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+ vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+ vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..043b652e6c
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
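+
+/* (Pass 1 leaves the intermediate results scaled up by 2^PASS1_BITS; pass 2
+ * removes that factor along with the IDCT's overall scale factor of 8, hence
+ * the extra "+ 3" in DESCALE_P2.)
+ */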
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
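+
+/* For example, multiplying a value x by 0.541196100 becomes a 16x16-bit
+ * widening multiply by F_0_541 followed by a descale. A scalar sketch of the
+ * idiom (illustrative only):
+ *
+ *   int32_t mul_0_541(int16_t x)
+ *   {
+ *     return ((int32_t)x * F_0_541) >> CONST_BITS;   (x * 4433 / 8192)
+ *   }
+ *
+ * The vector code below keeps the full 32-bit products and defers the right
+ * shift to the final descale stages (DESCALE_P1 and DESCALE_P2).
+ */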
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};
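+
+/* (The code below indexes this table by vector and lane: for example,
+ * consts.val[0] lane 1 is F_0_541 and consts.val[2] lane 3 is
+ * F_1_175_MINUS_1_961.)
+ */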
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.) Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified. On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
+ * "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
+ * result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero. In this case, the inverse DCT
+ * result is all zero. For the left 4x8 half, this is handled identically
+ * to Case 2 above. For the right 4x8 half, we do no work and signal that
+ * the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.) If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block. (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
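+/* A scalar sketch of the zero-row test used below (illustrative only; the
+ * vector code ORs the rows together with vorr_s16() and inspects the result
+ * as a single 64-bit lane):
+ *
+ *   int64_t bits = 0;
+ *   for (int row = 4; row < 8; row++)
+ *     for (int col = 0; col < 4; col++)
+ *       bits |= coef_block[row * DCTSIZE + col];
+ *   if (bits == 0)
+ *     (rows 4-7 of the left 4x8 block are all zero; use the "sparse" pass)
+ */
+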
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ int16_t workspace_l[8 * DCTSIZE / 2];
+ int16_t workspace_r[8 * DCTSIZE / 2];
+
+ /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+ /* Load DCT coefficients in left 4x8 block. */
+ int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+ int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+ int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+ int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+ int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+ int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+ int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+ int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table for left 4x8 block. */
+ int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+ int16x4_t bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (bitmap_rows_4567 == 0) {
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (left_ac_bitmap == 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l, quadrant);
+ vst4_s16(workspace_r, quadrant);
+ } else {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l, workspace_r);
+ }
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l, workspace_r);
+ }
+
+ /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+ /* Load DCT coefficients in right 4x8 block. */
+ row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+ row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+ row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+ row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+ row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+ row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+ row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+ row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+ /* Load quantization table for right 4x8 block. */
+ quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+ quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+ bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ /* If this remains non-zero, a "regular" second pass will be performed. */
+ int64_t right_ac_dc_bitmap = 1;
+
+ if (right_ac_bitmap == 0) {
+ bitmap = vorr_s16(bitmap, row0);
+ right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (right_ac_dc_bitmap != 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+ vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+ }
+ } else {
+ if (bitmap_rows_4567 == 0) {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ }
+ }
+
+ /* Second pass: compute IDCT on rows in workspace. */
+
+ /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+ if (right_ac_dc_bitmap == 0) {
+ jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+ } else {
+ jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+ }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients. (To process the full 8x8 DCT block, this
+ * function, or some other optimized variant, needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
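+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */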
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ z3_s16 = vmul_s16(row4, quant_row4);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+ int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation (the right-hand sides use the original z3 and z4):
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation (the right-hand sides use the original values):
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0. This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients. (To process the full 8x8 DCT block, this function, or some
+ * other optimized variant, needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c. Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+ int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation (the right-hand sides use the original z3 and z4):
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation (the right-hand sides use the original values):
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
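+ /* (vaddhn/vsubhn return the high halves of the 32-bit sums, which already
+ * applies 16 bits of the descale; the saturating rounding shift below
+ * applies the remaining DESCALE_P2 - 16 bits.)
+ */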
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_211 1730
+#define F_0_509 4176
+#define F_0_601 4926
+#define F_0_720 5906
+#define F_0_765 6270
+#define F_0_850 6967
+#define F_0_899 7373
+#define F_1_061 8697
+#define F_1_272 10426
+#define F_1_451 11893
+#define F_1_847 15137
+#define F_2_172 17799
+#define F_2_562 20995
+#define F_3_624 29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+ -F_0_720, F_0_850, -F_1_272, F_3_624
+};
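+
+/* Per column, pass 1 of the 2x2 IDCT therefore reduces to (illustrative only;
+ * pass 2 applies the same arithmetic to the two rows that survive pass 1):
+ *
+ *   even = dc << (CONST_BITS + 2);
+ *   odd  = 3.624509785 * r1 - 1.272758580 * r3 +
+ *          0.850430095 * r5 - 0.720959822 * r7;   (scaled by 2^CONST_BITS)
+ *   out0 = descale(even + odd);
+ *   out1 = descale(even - odd);
+ */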
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+ /* Pass 1: process columns from input, put results in vectors row0 and
+ * row1.
+ */
+
+ /* Even part */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part */
+ int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+ int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows, ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process two rows, store to output array. */
+
+ /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+ * care."
+ */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part: we're only interested in the bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+ tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block to memory. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.211164243 = 1730 * 2^-13
+ * 0.509795579 = 4176 * 2^-13
+ * 0.601344887 = 4926 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.061594337 = 8697 * 2^-13
+ * 1.451774981 = 11893 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 2.172734803 = 17799 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};
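+
+/* (The code below indexes this table by vector and lane: for example,
+ * consts.val[0] lane 0 is F_1_847 and consts.val[2] lane 1 is F_2_562.)
+ */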
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+ */
+ int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+ row0 = dcval;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ } else {
+ /* All AC coefficients are non-zero; full IDCT calculation required. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+ int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x8_t z2 = vmulq_s16(row2, quant_row2);
+ int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+ int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+ int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+ int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+ int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+ /* Odd part */
+ int16x8_t z1 = vmulq_s16(row7, quant_row7);
+ z2 = vmulq_s16(row5, quant_row5);
+ z3 = vmulq_s16(row3, quant_row3);
+ int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+ tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+ tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+ tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+ tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ }
+
+ /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+ int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+ int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+ vreinterpretq_s32_s16(row_23.val[0]));
+ int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+ vreinterpretq_s32_s16(row_23.val[1]));
+
+ int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+ int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+ int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+ int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+ int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+ int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+ int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+ /* Commence second pass of IDCT. */
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+ int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+ vsubhn_s32(tmp12, tmp0));
+ int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+ vsubhn_s32(tmp10, tmp2));
+ output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+ * An interleaving store completes the transpose.
+ */
+ uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+ vqmovun_s16(output_cols_13));
+ uint16x4x2_t output_01_23 = { {
+ vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ } };
+
+ /* Store 4x4 block to memory. */
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
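+
+/* A scalar sketch of that conversion (illustrative only):
+ *
+ *   for (int row = 0; row < DCTSIZE; row++)
+ *     for (int col = 0; col < DCTSIZE; col++)
+ *       workspace[row * DCTSIZE + col] =
+ *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
+ */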
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
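+ /* (vsubl_u8() widens to 16-bit before subtracting; reinterpreting the
+ * unsigned result as signed yields the intended values in [-128, 127],
+ * since the subtraction wraps modulo 2^16.)
+ */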
+ int16x8_t row0 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
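+
+/* A scalar sketch of the method for one coefficient (illustrative only):
+ *
+ *   unsigned temp = abs(coef) + correction;       (pre-bias the magnitude)
+ *   temp = (temp * reciprocal) >> 16;             (multiply by scaled 1/q)
+ *   temp >>= shift;                               (finish the descale)
+ *   out = (coef < 0) ? -(int)temp : (int)temp;    (restore the sign)
+ */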
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR out_ptr = coef_block;
+ UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+ UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+ DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+ int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+ for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+ /* Load reciprocals of quantization values. */
+ uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+ uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+ int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+ int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+ int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+ int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+ /* Extract sign from coefficients. */
+ int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+ int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+ int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+ int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+ /* Get absolute value of DCT coefficients. */
+ uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+ uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+ uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+ uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+ /* Add correction. */
+ abs_row0 = vaddq_u16(abs_row0, corr0);
+ abs_row1 = vaddq_u16(abs_row1, corr1);
+ abs_row2 = vaddq_u16(abs_row2, corr2);
+ abs_row3 = vaddq_u16(abs_row3, corr3);
+
+ /* Multiply DCT coefficients by quantization reciprocals. */
+ int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+ vget_low_u16(recip0)));
+ int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+ vget_high_u16(recip0)));
+ int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+ vget_low_u16(recip1)));
+ int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+ vget_high_u16(recip1)));
+ int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+ vget_low_u16(recip2)));
+ int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+ vget_high_u16(recip2)));
+ int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+ vget_low_u16(recip3)));
+ int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+ vget_high_u16(recip3)));
+ /* Narrow back to 16-bit. */
+ row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+ row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+ row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+ row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+ /* Since VSHR only supports an immediate as its second argument, negate the
+ * shift value and shift left.
+ */
+ row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+ vnegq_s16(shift0)));
+ row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+ vnegq_s16(shift1)));
+ row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+ vnegq_s16(shift2)));
+ row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+ vnegq_s16(shift3)));
+
+ /* Restore sign to original product. */
+ row0 = veorq_s16(row0, sign_row0);
+ row0 = vsubq_s16(row0, sign_row0);
+ row1 = veorq_s16(row1, sign_row1);
+ row1 = vsubq_s16(row1, sign_row1);
+ row2 = veorq_s16(row2, sign_row2);
+ row2 = vsubq_s16(row2, sign_row2);
+ row3 = veorq_s16(row3, sign_row3);
+ row3 = vsubq_s16(row3, sign_row3);
+
+ /* Store quantized coefficients to memory. */
+ vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+ vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+ vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+ vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+ }
+}
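
A scalar sketch of the reciprocal trick described in the comment above (illustrative only; it assumes the UDCTELEM/UDCTELEM2 typedefs -- 16- and 32-bit unsigned -- from jcdctmgr.c, and the divisors layout the NEON loop indexes: reciprocal, correction, and shift tables of DCTSIZE2 entries each):

/* Quantize one block without division: strip the sign, add the correction,
 * multiply by the pre-scaled reciprocal, keep the high 16 bits, apply the
 * per-entry shift, then restore the sign -- mirroring the NEON loop. */
void quantize_scalar(JCOEFPTR coef_block, DCTELEM *divisors,
                     DCTELEM *workspace)
{
  UDCTELEM *recip = (UDCTELEM *)divisors;
  UDCTELEM *corr = (UDCTELEM *)divisors + DCTSIZE2;
  DCTELEM *shift = divisors + 3 * DCTSIZE2;
  int i;

  for (i = 0; i < DCTSIZE2; i++) {
    DCTELEM temp = workspace[i];
    DCTELEM sign = temp >> 15;                       /* 0 or -1 */
    UDCTELEM mag = (UDCTELEM)((temp ^ sign) - sign + corr[i]);
    UDCTELEM2 product = (UDCTELEM2)mag * recip[i];   /* 16x16 -> 32 bits */
    DCTELEM q = (DCTELEM)((product >> 16) >> shift[i]);
    coef_block[i] = (JCOEF)((q ^ sign) - sign);      /* restore the sign */
  }
}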
diff --git a/media/libjpeg/simd/arm/neon-compat.h b/media/libjpeg/simd/arm/neon-compat.h
new file mode 100644
index 0000000000..2907634e26
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZLL(x) __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
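
A hypothetical use of the macros above (the real consumers include the NEON Huffman encoders, which need bit lengths and byte swaps that every supported compiler spells differently):

#include <stdint.h>

/* Number of bits needed to represent a nonzero magnitude, e.g. 255 -> 8,
 * 1024 -> 11.  Like the underlying builtins, undefined for x == 0. */
static int bit_width(uint32_t x)
{
  return 32 - BUILTIN_CLZ(x);
}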
diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
+
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [ebx], ymm5 ; Save Cb
+
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [edx], ymm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_YMMWORD
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ add ebx, byte SIZEOF_YMMWORD ; outptr1
+ add edx, byte SIZEOF_YMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
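
The fixed-point arithmetic in the conversion above (and in the MMX and SSE2 variants that follow) can be modeled in a few lines of C. This is an illustration of the equations in the header comment, not the SIMD code itself; it assumes SCALEBITS = 16 and FIX(x) = round(x * 2^16), matching the PW_*/PD_* tables, and splits 0.587 * G into 0.337 * G + 0.250 * G so each coefficient pair fits a signed 16-bit vpmaddwd operand:

#include <stdint.h>

#define SCALEBITS      16
#define ONE_HALF       (1 << (SCALEBITS - 1))
#define CENTERJSAMPLE  128
#define FIX(x)         ((int32_t)((x) * (1 << SCALEBITS) + 0.5))

static void rgb_ycc_pixel(int r, int g, int b,
                          uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  /* Y: rounding constant is PD_ONEHALF; FIX(0.337) == F_0_587 - F_0_250. */
  int32_t yv = FIX(0.29900) * r + FIX(0.33700) * g + FIX(0.11400) * b +
               FIX(0.25000) * g + ONE_HALF;
  /* Cb/Cr: the 0.5 coefficient becomes a shift (the asm unpacks into the
   * high word, then psrld by 1); PD_ONEHALFM1_CJ folds the rounding
   * constant, a -1, and the CENTERJSAMPLE bias into a single addend. */
  int32_t cbv = -FIX(0.16874) * r - FIX(0.33126) * g +
                ((int32_t)b << (SCALEBITS - 1)) +
                ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);
  int32_t crv = ((int32_t)r << (SCALEBITS - 1)) -
                FIX(0.41869) * g - FIX(0.08131) * b +
                ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);

  *y  = (uint8_t)(yv >> SCALEBITS);
  *cb = (uint8_t)(cbv >> SCALEBITS);
  *cr = (uint8_t)(crv >> SCALEBITS);
}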
diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd mmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmD, mmA
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF, mmH
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=RE
+ movq MMWORD [wk(1)], mm1 ; wk(1)=RO
+ movq MMWORD [wk(2)], mm4 ; wk(2)=BE
+ movq MMWORD [wk(3)], mm5 ; wk(3)=BO
+
+ movq mm6, mm1
+ punpcklwd mm1, mm3
+ punpckhwd mm6, mm3
+ movq mm7, mm1
+ movq mm4, mm6
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor mm1, mm1
+ pxor mm6, mm6
+ punpcklwd mm1, mm5 ; mm1=BOL
+ punpckhwd mm6, mm5 ; mm6=BOH
+ psrld mm1, 1 ; mm1=BOL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BOH*FIX(0.500)
+
+ movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm1
+ paddd mm4, mm6
+ paddd mm7, mm5
+ paddd mm4, mm5
+ psrld mm7, SCALEBITS ; mm7=CbOL
+ psrld mm4, SCALEBITS ; mm4=CbOH
+ packssdw mm7, mm4 ; mm7=CbO
+
+ movq mm1, MMWORD [wk(2)] ; mm1=BE
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ movq mm5, mm0
+ movq mm4, mm6
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor mm0, mm0
+ pxor mm6, mm6
+ punpcklwd mm0, mm1 ; mm0=BEL
+ punpckhwd mm6, mm1 ; mm6=BEH
+ psrld mm0, 1 ; mm0=BEL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BEH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm5, mm0
+ paddd mm4, mm6
+ paddd mm5, mm1
+ paddd mm4, mm1
+ psrld mm5, SCALEBITS ; mm5=CbEL
+ psrld mm4, SCALEBITS ; mm4=CbEH
+ packssdw mm5, mm4 ; mm5=CbE
+
+ psllw mm7, BYTE_BIT
+ por mm5, mm7 ; mm5=Cb
+ movq MMWORD [ebx], mm5 ; Save Cb
+
+ movq mm0, MMWORD [wk(3)] ; mm0=BO
+ movq mm6, MMWORD [wk(2)] ; mm6=BE
+ movq mm1, MMWORD [wk(1)] ; mm1=RO
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ movq mm7, mm0
+ movq mm5, mm4
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, MMWORD [wk(4)]
+ paddd mm4, MMWORD [wk(5)]
+ paddd mm0, mm3
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ pxor mm3, mm3
+ pxor mm4, mm4
+ punpcklwd mm3, mm1 ; mm3=ROL
+ punpckhwd mm4, mm1 ; mm4=ROH
+ psrld mm3, 1 ; mm3=ROL*FIX(0.500)
+ psrld mm4, 1 ; mm4=ROH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm3
+ paddd mm5, mm4
+ paddd mm7, mm1
+ paddd mm5, mm1
+ psrld mm7, SCALEBITS ; mm7=CrOL
+ psrld mm5, SCALEBITS ; mm5=CrOH
+ packssdw mm7, mm5 ; mm7=CrO
+
+ movq mm3, MMWORD [wk(0)] ; mm3=RE
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ movq mm1, mm6
+ movq mm5, mm4
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(6)]
+ paddd mm4, MMWORD [wk(7)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ pxor mm2, mm2
+ pxor mm4, mm4
+ punpcklwd mm2, mm3 ; mm2=REL
+ punpckhwd mm4, mm3 ; mm4=REH
+ psrld mm2, 1 ; mm2=REL*FIX(0.500)
+ psrld mm4, 1 ; mm4=REH*FIX(0.500)
+
+ movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+ paddd mm1, mm2
+ paddd mm5, mm4
+ paddd mm1, mm0
+ paddd mm5, mm0
+ psrld mm1, SCALEBITS ; mm1=CrEL
+ psrld mm5, SCALEBITS ; mm5=CrEH
+ packssdw mm1, mm5 ; mm1=CrE
+
+ psllw mm7, BYTE_BIT
+ por mm1, mm7 ; mm1=Cr
+ movq MMWORD [edx], mm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_MMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ add ebx, byte SIZEOF_MMWORD ; outptr1
+ add edx, byte SIZEOF_MMWORD ; outptr2
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
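
Note that emms executes only on the normal exit path: the early jumps to .return fire before any MMX instruction has run, so there is no MMX state to clear in those cases. (emms is required after MMX code because the MMX registers alias the x87 floating-point register stack.)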
diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_XMMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
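The equ values above follow FIX(x) = round(x * 2^16): for example, 0.29900 * 65536 = 19595.264, which rounds to F_0_299 = 19595, and 0.11400 * 65536 = 7471.104 gives F_0_114 = 7471. F_0_337 is derived as F_0_587 - F_0_250 = 38470 - 16384 = 22086 rather than rounded directly, so that adding the separate 0.250 * G term reproduces FIX(0.587) * G exactly in the Y computation.
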
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
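+
+; jccolext-avx2.asm is used as a template: each pass redefines RGB_RED,
+; RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE to describe one pixel layout and
+; renames jsimd_rgb_ycc_convert_avx2, so the seven %includes above emit
+; seven layout-specific converters from a single source file.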
diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
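+
+; The argument accessors take b = eax, which the prologue sets to esp just
+; after ebp is pushed, so the return address sits at eax+4 and the first
+; argument at eax+8.  wk(i) addresses WK_NUM aligned scratch slots below
+; the realigned ebp, and gotptr saves the GOT pointer just beneath them.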
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
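+        ; The tail loader above handles a partial final group: the leftover
+        ; byte count is decomposed bit by bit (1-, 2-, 4-, 8-, 16-, and
+        ; 32-byte chunks), each chunk is loaded from the end of the row and
+        ; shifted/ORed into the accumulating vector, and ecx is then reset
+        ; to a full SIZEOF_YMMWORD so the conversion code can treat the
+        ; fragment as one whole group.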
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
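+        ; (ymmA-ymmH above are aliases that jcolsamp.inc maps onto physical
+        ; registers according to the pixel layout, so after deinterleaving
+        ; the even/odd R, G, and B planes always land in ymm0-ymm5 as
+        ; annotated here.)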
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
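+        ;
+        ; 0.58700 is split as 0.33700 + 0.25000 because FIX(0.58700) = 38470
+        ; does not fit in the signed 16-bit words consumed by vpmaddwd, so Y
+        ; is formed from two word-pair dot products; per pixel the vector
+        ; code below is roughly equivalent to this scalar sketch:
+        ;
+        ;   y = (r * 19595 + g * 22086) +     /* PW_F0299_F0337 */
+        ;       (b * 7471  + g * 16384) +     /* PW_F0114_F0250 */
+        ;       32768;                        /* PD_ONEHALF     */
+        ;   y >>= 16;                         /* SCALEBITS      */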
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm4, ymm4, ymm7
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y
+
+ sub ecx, byte SIZEOF_YMMWORD
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd mmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD
+ jmp short .rgb_gray_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_gray_cnv
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_gray_cnv
+ movq mmD, mmA
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF, mmH
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movq mm6, mm1
+ punpcklwd mm1, mm3
+ punpckhwd mm6, mm3
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq mm0, mm5 ; mm0=BO
+ movq mm6, mm4 ; mm6=BE
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, mm1
+ paddd mm4, mm7
+ paddd mm0, mm3
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(0)]
+ paddd mm4, MMWORD [wk(1)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ sub ecx, byte SIZEOF_MMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
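+        ; (emms is required because the MMX registers alias the x87 FPU
+        ; register stack; clearing the state lets the caller use floating
+        ; point again.)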
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ sub ecx, byte SIZEOF_XMMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/media/libjpeg/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+ alignz 32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+ dq 0x000f, 0x001f, 0x003f, 0x007f
+ dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+ alignz 32
+
+%ifdef PIC
+%define NBITS(x) nbits_base + x
+%else
+%define NBITS(x) jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%define mm_put_buffer mm0
+%define mm_all_0xff mm1
+%define mm_temp mm2
+%define mm_nbits mm3
+%define mm_code_bits mm3
+%define mm_code mm4
+%define mm_overflow_bits mm5
+%define mm_save_nbits mm6
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values, or
+; mmN treated as eight unsigned 8-bit values
+; bN[i]: perform the same operation on all unsigned 8-bit values,
+; i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
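+;
+; For example, "w0[i] = w0[i] + w1[i]" in this notation would describe
+; paddw xmm0, xmm1.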
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
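+;
+; Roughly, the slow path below behaves like this C sketch (illustrative
+; names, not the actual jchuff.c code):
+;
+;   for (i = 0; i < 8; i++) {        /* bytes of the qword, MSB first */
+;     *buffer++ = byte[i];
+;     if (byte[i] == 0xFF)
+;       *buffer++ = 0;               /* stuff 0x00 after a 0xFF byte */
+;   }
+;
+; The fast path stores all 8 bytes with two 32-bit writes when pmovmskb
+; shows that no byte equals 0xFF.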
+
+%macro EMIT_QWORD 9
+%define %%temp %1
+%define %%tempb %2
+%define %%temph %3
+ add nbits, free_bits ; nbits += free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ movq mm_temp, mm_code ; temp = code;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
+ add free_bits, 64 ; free_bits += 64;
+ por mm_temp, mm_put_buffer ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+ movd mm_save_nbits, nbits_base ; save nbits_base
+%endif
+ movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
+ movq mm_put_buffer, mm_code ; put_buffer = code;
+ pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+ movq mm_code, mm_code_bits ; code = code_bits;
+ psrlq mm_code_bits, 32 ; code_bits >>= 32;
+ pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
+ movd %%temp, mm_code_bits ; temp = code_bits;
+ bswap %%temp ; temp = htonl(temp);
+ test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
+ jnz %%.SLOW ; goto %%.SLOW
+ mov dword [buffer], %%temp ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ %4
+ movd nbits, mm_code ; nbits = (uint32_t)(code);
+ %5
+ bswap nbits ; nbits = htonl(nbits);
+ mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits;
+ lea buffer, [buffer + 8] ; buffer += 8;
+ %6
+ %7
+ %8
+ jmp %9 ; return
+%%.SLOW:
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
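+ ; (cmp leaves CF set when the byte is < 0xFF, so "sbb buffer, -2"
+ ; computes buffer + 2 - CF: the pointer advances by 1 past a normal
+ ; byte and by 2 past a stuffed 0xFF 0x00 pair.)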
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr %%temp, 16 ; temp >>= 16;
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ bswap nbits ; nbits = htonl(nbits)
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ shr nbits, 16 ; nbits >>= 16;
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ %4
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ %5
+ %6
+ %7
+ %8
+ jmp %9 ; return;
+%endmacro
+
+%macro PUSH 1
+ push %1
+%assign stack_offset stack_offset + 4
+%endmacro
+
+%macro POP 1
+ pop %1
+%assign stack_offset stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register. Equivalent to
+; get_GOT %1
+; lea %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+ call %%.geteip
+%%.ref:
+ %4
+ add %1, %2 - %%.ref
+ jmp short %%.done
+ align 32
+%%.geteip:
+ %3 4 ; must adjust stack pointer because of call
+ mov %1, POINTER [esp]
+ ret
+ align 32
+%%.done:
+%else
+ %3 0
+ %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame eax
+%ifdef PIC
+%define nbits_base ebx
+%endif
+%define emit_temp ebx
+%define emit_tempb bl
+%define emit_temph bh
+%define dctbl ecx
+%define block edx
+%define code_temp esi
+%define index_temp edi
+%define t esp
+%define index ebp
+
+%assign save_frame DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
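+;
+; In scalar terms, this step computes per coefficient (illustrative
+; sketch; cf. encode_one_block() in jchuff.c):
+;
+;   int x = block[jpeg_natural_order[k]];
+;   t[k] = x - (x < 0);    /* for x < 0 this is ~(-x): the bits to emit */
+;
+; and collects a bitmap of the zero coefficients (pcmpeqw / packsswb /
+; pmovmskb) that drives the tzcnt-based skip loops below.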
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset 0
+%define arg_state 4 + stack_offset
+%define arg_buffer 8 + stack_offset
+%define arg_block 12 + stack_offset
+%define arg_last_dc_val 16 + stack_offset
+%define arg_dctbl 20 + stack_offset
+%define arg_actbl 24 + stack_offset
+
+ ;X: X = code stream
+ mov block, [esp + arg_block]
+ PUSH ebx
+ PUSH ebp
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ PUSH esi
+ PUSH edi
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ mov frame, esp
+ lea t, [frame - (save_frame + 4)]
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
+ mov [t + save_frame], frame
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movsx code_temp, word [block] ;Z: code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+ movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+ ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+ sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
+ pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl index_temp, 16 ;Z: index_temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ or index, index_temp ;Z: index |= index_temp;
+%undef index_temp
+%define free_bits edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ not index ;Z: index = ~index;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ mov dctbl, [frame + arg_dctbl]
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
+%endmacro
+
+ GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
+ ;Z: i.e. if code_temp is positive
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ movd mm_temp, code_temp ;Z: temp = code_temp
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define nbits edx
+%define nbitsb dl
+%define nbitsh dh
+ movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state esi
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ mov state, [frame + arg_state]
+ movd mm_nbits, nbits ;Z: nbits --> MMX register
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+ ;Z: code = dctbl->ehufco[nbits];
+%define size ecx
+%define sizeb cl
+%define sizeh ch
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+ ;Z: size = dctbl->ehufsi[nbits];
+%undef dctbl
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ mov free_bits, [state + working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+%undef state
+%define actbl esi
+ mov actbl, [frame + arg_actbl]
+%define buffer eax
+ mov buffer, [frame + arg_buffer]
+%undef frame
+ jmp .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+; size <= 32, so this is not really a loop
+.BRLOOP1: ; .BRLOOP1:
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ and index, 0x7ffffff ; clear index if size == 32
+ sub size, 16 ; size -= 16;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ jmp .ERLOOP1 ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+%ifdef PIC
+ times 6 nop
+%else
+ times 2 nop
+%endif
+.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged. This is corrected in .BRLOOP1.
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP1 ; goto .BRLOOP1;
+.ERLOOP1: ; .ERLOOP1:
+ movsx nbits, word [t] ; nbits = *t;
+%ifdef PIC
+ add size, size ; size += size;
+%else
+ lea size, [size * 2] ; size += size;
+%endif
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+.BEGIN: ; .BEGIN:
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ add nbits, size ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP1 ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1: ; .ELOOP1:
+ pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
+ pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
+ shl size, 16 ; size <<= 16;
+ or index, size ; index |= size;
+ not index ; index = ~index;
+ lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+ ; nbits = t + 1 + 64;
+ and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
+ sub nbits, t ; nbits -= t;
+ shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ inc size ; ++size;
+ test index, index ; if (index == 0)
+ jz .ELOOP2 ; goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+ shr index, sizeb ; index >>= size;
+ lea size, [size + nbits - 33] ; size = size + nbits - 33;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+.BRLOOP2: ; do {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ sub size, 16 ; size -= 16;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
+ movd mm_nbits, nbits ; else { nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+ jmp .BRLOOP2 ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP2 ; goto .BRLOOP2;
+.ERLOOP2: ; .ERLOOP2:
+ movsx nbits, word [t] ; nbits = *t;
+ add size, size ; size += size;
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ movd mm_nbits, nbits ; nbits --> MMX register
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ lea nbits, [nbits + size] ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP2 ; } while (index != 0);
+.ELOOP2: ; .ELOOP2:
+ mov nbits, t ; nbits = t;
+ lea t, [t + SIZEOF_WORD] ; t = &t[1];
+ and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
+ cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
+ je .EFN ; {
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+ ; code = actbl->ehufco[0];
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+.EFN: ; } }
+%define frame esp
+ mov frame, [t + save_frame]
+%define state ecx
+ mov state, [frame + arg_state]
+ movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ emms
+ mov [state + working_state.cur.free_bits], free_bits
+ ; state->cur.free_bits = free_bits;
+ POP edi
+ POP esi
+ POP ebp
+ POP ebx
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP1:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
+ .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP1:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP1 }, \
+ .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP2:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
+ { cmp size, 16 }, \
+ { jle .ERLOOP2 }, \
+ .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP2:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP2 }, \
+ .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
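+;
+; Each LOADn macro is a gather; in C terms, roughly:
+;
+;   for (k = 0; k < n; k++)
+;     x[k] = block[jpeg_natural_order_start[k]];
+;
+; LOAD15 and LOAD7 guard the tail lanes with LENEND so that only the
+; remaining coefficients are read.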
+
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ mov T1, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl ecx, 16
+ shl esi, 16
+
+ or eax, ecx
+ or edx, esi
+
+ not eax
+ not edx
+
+ mov edi, ZEROBITS
+
+ mov INT [edi], eax
+ mov INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
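+;
+; Per-coefficient sketch of the preparation (illustrative; cf.
+; encode_mcu_AC_first() in jcphuff.c):
+;
+;   int v = block[jpeg_natural_order_start[k]];
+;   int a = (v < 0 ? -v : v) >> Al;    /* successive-approximation shift */
+;   values[k]            = a;
+;   values[k + DCTSIZE2] = (v < 0) ? ~a : a;
+;
+; zerobits receives a bitmap with bit k set iff values[k] != 0 (REDUCE0).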
+
+%define ZERO xmm7
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define LEN ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 4
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov ZEROBITS, T0
+ mov LEN, INT [eax + 16]
+ pxor ZERO, ZERO
+ mov K, LEN
+ and K, -16
+ shr K, 4
+ jz .ELOOP16
+.BLOOP16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ test LEN, 15
+ je .PADDING
+.ELOOP16:
+ mov LENEND, LEN
+ and LENEND, 7
+
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
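+;
+; Sketch of the outputs (illustrative; cf. encode_mcu_AC_refine() in
+; jcphuff.c):
+;
+;   absvalues[k] = (v < 0 ? -v : v) >> Al;
+;   bits[0..1]   = bitmap with bit k set iff absvalues[k] != 0 (REDUCE0);
+;   bits[2..3]   = per-coefficient sign information;
+;   return value = EOB, roughly the index of the last k with
+;                  absvalues[k] == 1.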
+
+%define ZERO xmm7
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T0w cx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define KK ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+%define EOB INT [esp + 5 * 4 + 4]
+%define LEN INT [esp + 5 * 4 + 8]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 16
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov K, INT [eax + 16]
+ mov INT [T0 + 2 * SIZEOF_INT], -1
+ mov INT [T0 + 3 * SIZEOF_INT], -1
+ mov ZEROBITS, T0
+ mov LEN, K
+ pxor ZERO, ZERO
+ and K, -16
+ mov EOB, 0
+ xor KK, KK
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 2
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15
+ je .PADDINGR
+.ELOOPR16:
+ mov LENEND, LEN
+
+ test LENEND, 8
+ jz .TRYR7
+ test LENEND, 7
+ jz .TRYR8
+
+ and LENEND, 7
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ and LENEND, 7
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ mov eax, EOB
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
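+; In scalar terms the kernel below computes (cf. h2v1_downsample() in
+; jcsample.c; the alternating bias keeps rounding from drifting):
+;
+;   outptr[j] = (inptr[2*j] + inptr[2*j+1] + (j & 1)) >> 1;
+;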
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
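+ ; (Replicates each row's last pixel into columns image_width ..
+ ; output_cols*2 - 1 so the downsampler can always read full vectors;
+ ; rep stosb stores the byte in al, ecx times, starting at edi.)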
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r24:
+ ; ecx can possibly be 8, 16, 24
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT
+ vpand ymm0, ymm0, ymm6
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
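+; In scalar terms (cf. h2v2_downsample() in jcsample.c; the bias
+; alternates 1, 2 to avoid a rounding drift):
+;
+;   outptr[j] = (inptr0[2*j] + inptr0[2*j+1] +
+;                inptr1[2*j] + inptr1[2*j+1] + 1 + (j & 1)) >> 2;
+;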
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r24:
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
+ vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6
+ vpsrlw ymm0, ymm0, BYTE_BIT
+ vpand ymm5, ymm1, ymm6
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+
+ vpand ymm4, ymm2, ymm6
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add edx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6
+ punpckldq mm7, mm7 ; mm7={0, 1, 0, 1}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mm2, mm0
+ movq mm3, mm1
+
+ pand mm0, mm6
+ psrlw mm2, BYTE_BIT
+ pand mm1, mm6
+ psrlw mm3, BYTE_BIT
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm7
+ paddw mm1, mm7
+ psrlw mm0, 1
+ psrlw mm1, 1
+
+ packuswb mm0, mm1
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz short .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
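+; A scalar sketch of the computation (cf. h2v2_downsample in
+; jcsample.c; the bias alternates 1,2 across columns):
+;
+;   for (col = 0; col < output_cols; col++)
+;     outptr[col] = (inptr0[2 * col] + inptr0[2 * col + 1] +
+;                    inptr1[2 * col] + inptr1[2 * col + 1] +
+;                    1 + (col & 1)) >> 2;
+;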
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6
+ punpckldq mm7, mm7 ; mm7={1, 2, 1, 2}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
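+ ; Same masking scheme as h2v1, but pairs from both input rows are
+ ; summed before the shift: each output sample is a 2x2 average,
+ ; (a + b + c + d + bias) >> 2, with the bias alternating 1,2 so the
+ ; rounding error averages out across columns.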
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+ movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pand mm0, mm6
+ psrlw mm4, BYTE_BIT
+ pand mm1, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm0, mm4
+ paddw mm1, mm5
+
+ movq mm4, mm2
+ movq mm5, mm3
+ pand mm2, mm6
+ psrlw mm4, BYTE_BIT
+ pand mm3, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ paddw mm0, mm1
+ paddw mm2, mm3
+ paddw mm0, mm7
+ paddw mm2, mm7
+ psrlw mm0, 2
+ psrlw mm2, 2
+
+ packuswb mm0, mm2
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add edx, byte 2*SIZEOF_MMWORD ; inptr0
+ add esi, byte 2*SIZEOF_MMWORD ; inptr1
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz near .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
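+ ; Tail case: fewer than SIZEOF_XMMWORD output columns remain, so only
+ ; one input vector is still valid.  Zero the second one, set ecx to a
+ ; full vector so the loop exits after this pass, and reuse the shared
+ ; .downsample code; any bytes written past output_cols presumably land
+ ; in the row padding that the library allocates for SIMD use.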
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz short .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add edx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
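+ ; The prologue above builds a frame whose base (ebp) is 32-byte
+ ; aligned so that the wk() scratch slots can hold whole ymmwords; the
+ ; caller's stack pointer is saved at [ebp] and restored by the
+ ; "mov esp, ebp / pop esp" sequence in the epilogue.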
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
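+ ; In PIC builds the constant pool (the PW_*/PD_* tables defined in
+ ; jdcolor-avx2.asm) must be addressed relative to the GOT, so the GOT
+ ; pointer is cached in the gotptr scratch slot and reloaded into eax
+ ; at the top of each row.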
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7
+ vpaddw ymm3, ymm5, ymm7
+ vpaddw ymm6, ymm0, ymm7
+ vpaddw ymm7, ymm1, ymm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
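+ ;
+ ; The rewrite uses 1.40200 = 0.40200 + 1, -0.71414 = 0.28586 - 1, and
+ ; 1.77200 = -0.22800 + 2, trading coefficients of magnitude >= 1 for
+ ; fractional ones plus whole additions of Cb/Cr, so that every
+ ; multiplier fits in the signed 16-bit fixed-point range that pmulhw
+ ; requires.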
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm5, ymm5, ymm3
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6
+ vpunpcklwd ymm2, ymm2, ymm6
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm5, ymm3, ymm7
+ vpunpcklwd ymm3, ymm3, ymm7
+ vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm4, ymm4, SCALEBITS
+ vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm3, ymm3, SCALEBITS
+ vpsrad ymm5, ymm5, SCALEBITS
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
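+ ; vpunpck* and vpshufd operate independently within each 128-bit lane
+ ; of a ymm register, so the interleaving above leaves the RGB bytes
+ ; grouped per lane; the three vperm2i128 shuffles stitch the lanes
+ ; back into linear memory order for the 96-byte (32-pixel) store below.
+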
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
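+ ; Partial-vector epilogue: the remaining pixel count is converted to a
+ ; byte count (x3 for RGB) and ymmA/ymmD/ymmF are drained in shrinking
+ ; chunks (32, 16, 8, 4, 2, then 1 byte), shifting the next bytes into
+ ; position after each store.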
+.column_st64:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
+ movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
+
+ pcmpeqw mm4, mm4
+ pcmpeqw mm7, mm7
+ psrlw mm4, BYTE_BIT
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+ movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand mm4, mm5 ; mm4=Cb(0246)=CbE
+ psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO
+ pand mm0, mm1 ; mm0=Cr(0246)=CrE
+ psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO
+
+ paddw mm4, mm7
+ paddw mm5, mm7
+ paddw mm0, mm7
+ paddw mm1, mm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
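+ ;
+ ; The fractional products use pmulhw, which keeps only the high 16
+ ; bits of the 32-bit product.  Doubling the operand first buys one
+ ; extra fraction bit, and the "paddw PW_ONE / psraw 1" that follows
+ ; uses it to round to nearest instead of truncating:
+ ;   x * c ~= (((2 * x * FIX(c)) >> 16) + 1) >> 1, FIX(c) = c * 65536.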
+
+ movq mm2, mm4 ; mm2=CbE
+ movq mm3, mm5 ; mm3=CbO
+ paddw mm4, mm4 ; mm4=2*CbE
+ paddw mm5, mm5 ; mm5=2*CbO
+ movq mm6, mm0 ; mm6=CrE
+ movq mm7, mm1 ; mm7=CrO
+ paddw mm0, mm0 ; mm0=2*CrE
+ paddw mm1, mm1 ; mm1=2*CrO
+
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
+ pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
+ pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
+
+ paddw mm4, [GOTOFF(eax,PW_ONE)]
+ paddw mm5, [GOTOFF(eax,PW_ONE)]
+ psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800))
+ psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800))
+ paddw mm0, [GOTOFF(eax,PW_ONE)]
+ paddw mm1, [GOTOFF(eax,PW_ONE)]
+ psraw mm0, 1 ; mm0=(CrE * FIX(0.40200))
+ psraw mm1, 1 ; mm1=(CrO * FIX(0.40200))
+
+ paddw mm4, mm2
+ paddw mm5, mm3
+ paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
+ movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
+
+ movq mm4, mm2
+ movq mm5, mm3
+ punpcklwd mm2, mm6
+ punpckhwd mm4, mm6
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm3, mm7
+ punpckhwd mm5, mm7
+ pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS
+ psrad mm4, SCALEBITS
+ paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm3, SCALEBITS
+ psrad mm5, SCALEBITS
+
+ packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movq mm5, MMWORD [esi] ; mm5=Y(01234567)
+
+ pcmpeqw mm4, mm4
+ psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
+ pand mm4, mm5 ; mm4=Y(0246)=YE
+ psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO
+
+ paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+ paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
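+ ;
+ ; The unpack sequence below interleaves the even/odd registers of the
+ ; three color components into memory-order triplets: bytes first
+ ; (punpcklbw), then words and dwords, yielding three mmwords that
+ ; cover eight pixels (24 bytes).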
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA
+ movq mmH, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT
+ movd eax, mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .nextrow
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .nextrow
+ movd dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
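+ ;
+ ; The two G-channel products are fused: interleaving Cb with Cr and
+ ; multiplying by the packed constant PW_MF0344_F0285 lets one pmaddwd
+ ; compute Cb * -FIX(0.344) + Cr * FIX(0.285) per dword (rounded via
+ ; PD_ONEHALF and psrad), and the later psubw of Cr supplies the
+ ; remaining "- Cr" term from the rewrite above.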
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ paddw xmm5, [GOTOFF(eax,PW_ONE)]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ paddw xmm1, [GOTOFF(eax,PW_ONE)]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
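These are the usual libjpeg fixed-point constants: FIX(x) is x scaled by 2^SCALEBITS and rounded, and the derived F_0_402/F_0_285/F_0_228 values are the remainders used by the rearranged equations in the included conversion code. A standalone check of the values (a sketch, not library code):

    #include <stdio.h>

    #define SCALEBITS 16
    #define FIX(x)  ((long)((x) * (1L << SCALEBITS) + 0.5))

    int main(void)
    {
      printf("%ld\n", FIX(0.34414));           /* 22554  = F_0_344 */
      printf("%ld\n", FIX(0.71414));           /* 46802  = F_0_714 */
      printf("%ld\n", FIX(1.40200) - 65536);   /* 26345  = F_0_402 */
      printf("%ld\n", 65536 - FIX(0.71414));   /* 18734  = F_0_285 */
      printf("%ld\n", 131072 - FIX(1.77200));  /* 14942  = F_0_228 */
      return 0;
    }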
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
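NASM has no template mechanism, so this file instantiates the generic jdcolext-avx2.asm body once per pixel format: each block below redefines the RGB_* layout macros and renames the entry point, then re-includes the implementation. A C analogue of the idiom, sketched with illustrative names:

    #include <stdio.h>

    /* One generic body, expanded once per pixel layout by re-supplying
       the parameter macros -- the same trick as %define + %include. */
    #define DEFINE_STORE(name, R, G, B)                                 \
      static void name(const unsigned char px[3], unsigned char out[3]) \
      {                                                                 \
        out[R] = px[0]; out[G] = px[1]; out[B] = px[2];                 \
      }

    DEFINE_STORE(store_rgb, 0, 1, 2)   /* RGB byte order */
    DEFINE_STORE(store_bgr, 2, 1, 0)   /* BGR byte order */

    int main(void)
    {
      unsigned char px[3] = { 1, 2, 3 }, out[3];
      store_bgr(px, out);
      printf("%u %u %u\n", out[0], out[1], out[2]);   /* 3 2 1 */
      return 0;
    }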
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402 times 4 dw F_0_402
+PW_MF0228 times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE times 4 dw 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402 times 4 dw F_0_402
+PW_MF0228 times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE times 4 dw 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
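The rearranged form exists because FIX(1.40200) = 91881 and FIX(1.77200) = 116130 do not fit in the signed 16-bit multiplier that pmulhw takes, so the code multiplies by the sub-unity remainder (on a doubled operand, rounded back down via the PW_ONE/psraw pair) and adds the whole Cr or Cb back in separately. A scalar sketch of the R channel, assuming arithmetic right shifts on negative values as pmulhw/psraw provide:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      long f_0_402 = 26345;   /* FIX(1.40200) - FIX(1) */
      long maxdiff = 0;
      for (long cr = -128; cr < 128; cr++) {
        /* pmulhw(2*Cr, F_0_402) -> paddw PW_ONE -> psraw 1 -> add Cr */
        long approx = ((((2 * cr * f_0_402) >> 16) + 1) >> 1) + cr;
        long exact  = lround(1.402 * cr);
        if (labs(approx - exact) > maxdiff) maxdiff = labs(approx - exact);
      }
      printf("max |approx - exact| = %ld\n", maxdiff);   /* stays within 1 */
      return 0;
    }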
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
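The green term leans on pmaddwd: interleaving each (Cb, Cr) pair as 16-bit words and multiplying by the constant pair (-FIX(0.34414), FIX(0.28586)) yields -0.344*Cb + 0.285*Cr in one instruction, and the trailing vpsubw of Cr turns the +0.28586 into the required -0.71414. A scalar check of the same arithmetic (a standalone sketch, not library code):

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      long mf0344 = -22554, f0285 = 18734;   /* -FIX(0.34414), FIX(0.28586) */
      long onehalf = 1L << 15;               /* PD_ONEHALF, SCALEBITS = 16  */
      long maxdiff = 0;
      for (long cb = -128; cb < 128; cb++)
        for (long cr = -128; cr < 128; cr++) {
          /* pmaddwd pair-product, rounded, then psubw of Cr */
          long approx = ((cb * mf0344 + cr * f0285 + onehalf) >> 16) - cr;
          long exact  = lround(-0.34414 * cb - 0.71414 * cr);
          if (labs(approx - exact) > maxdiff) maxdiff = labs(approx - exact);
        }
      printf("max |approx - exact| = %ld\n", maxdiff);   /* stays within 1 */
      return 0;
    }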
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
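This is the merged-upsampling core: one set of (R-Y), (G-Y), (B-Y) differences is computed per chroma sample, and the mask/shift pair splits Y into even and odd samples so each difference is added to the two luma samples it covers, folding 2:1 horizontal upsampling into the color conversion. A scalar sketch for one channel, with clamp standing in for packuswb saturation:

    #include <stdio.h>

    static unsigned char clamp(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    int main(void)
    {
      unsigned char y[8] = { 16, 32, 64, 96, 128, 160, 200, 235 };
      int r_minus_y[4]   = { 10, -5, 0, 20 };   /* one value per Cr sample */
      unsigned char r[8];
      for (int i = 0; i < 4; i++) {
        r[2*i]   = clamp(y[2*i]   + r_minus_y[i]);   /* even sample, YE */
        r[2*i+1] = clamp(y[2*i+1] + r_minus_y[i]);   /* odd sample,  YO */
      }
      for (int i = 0; i < 8; i++) printf("%u ", r[i]);
      putchar('\n');
      return 0;
    }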
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, POINTER [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
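The h2v2 entry point above is a thin wrapper: with 2:1 vertical sampling the same Cb/Cr rows serve two luma rows, so it builds a three-pointer row group on the stack, calls the h2v1 routine for the first luma/output row, patches in the second luma row and output row, and calls it again. A C sketch of that control flow, with stand-in types in place of the JSAMPROW machinery (the real argument layout differs):

    #include <stdio.h>

    typedef unsigned char *ROW;

    /* stand-in for jsimd_h2v1_merged_upsample_avx2 */
    static void h2v1(unsigned width, ROW rows[3], ROW out)
    {
      (void)width; (void)rows; (void)out;
      puts("converted one output row");
    }

    static void h2v2(unsigned width, ROW y01[2], ROW cb, ROW cr, ROW out[2])
    {
      ROW rows[3] = { y01[0], cb, cr };
      h2v1(width, rows, out[0]);   /* first luma row  -> output row 0 */
      rows[0] = y01[1];            /* Cb/Cr rows are reused unchanged */
      h2v1(width, rows, out[1]);   /* second luma row -> output row 1 */
    }

    int main(void)
    {
      unsigned char y0[16], y1[16], cb[8], cr[8], o0[64], o1[64];
      ROW y01[2] = { y0, y1 }, out[2] = { o0, o1 };
      h2v2(16, y01, cb, cr, out);
      return 0;
    }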
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
+ movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
+
+ pxor mm1, mm1 ; mm1=(all 0's)
+ pcmpeqw mm3, mm3
+ psllw mm3, 7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ movq mm4, mm6
+ punpckhbw mm6, mm1 ; mm6=Cb(4567)=CbH
+ punpcklbw mm4, mm1 ; mm4=Cb(0123)=CbL
+ movq mm0, mm7
+ punpckhbw mm7, mm1 ; mm7=Cr(4567)=CrH
+ punpcklbw mm0, mm1 ; mm0=Cr(0123)=CrL
+
+ paddw mm6, mm3
+ paddw mm4, mm3
+ paddw mm7, mm3
+ paddw mm0, mm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm5, mm6 ; mm5=CbH
+ movq mm2, mm4 ; mm2=CbL
+ paddw mm6, mm6 ; mm6=2*CbH
+ paddw mm4, mm4 ; mm4=2*CbL
+ movq mm1, mm7 ; mm1=CrH
+ movq mm3, mm0 ; mm3=CrL
+ paddw mm7, mm7 ; mm7=2*CrH
+ paddw mm0, mm0 ; mm0=2*CrL
+
+ pmulhw mm6, [GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
+ pmulhw mm7, [GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
+
+ paddw mm6, [GOTOFF(eax,PW_ONE)]
+ paddw mm4, [GOTOFF(eax,PW_ONE)]
+ psraw mm6, 1 ; mm6=(CbH * -FIX(0.22800))
+ psraw mm4, 1 ; mm4=(CbL * -FIX(0.22800))
+ paddw mm7, [GOTOFF(eax,PW_ONE)]
+ paddw mm0, [GOTOFF(eax,PW_ONE)]
+ psraw mm7, 1 ; mm7=(CrH * FIX(0.40200))
+ psraw mm0, 1 ; mm0=(CrL * FIX(0.40200))
+
+ paddw mm6, mm5
+ paddw mm4, mm2
+ paddw mm6, mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw mm4, mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw mm7, mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw mm0, mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
+ movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
+
+ movq mm6, mm5
+ movq mm7, mm2
+ punpcklwd mm5, mm1
+ punpckhwd mm6, mm1
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm2, mm3
+ punpckhwd mm7, mm3
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm5, SCALEBITS
+ psrad mm6, SCALEBITS
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS
+ psrad mm7, SCALEBITS
+
+ packssdw mm5, mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw mm2, mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw mm5, mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw mm2, mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
+ movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
+ movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ movq mm7, MMWORD [esi] ; mm7=Y(01234567)
+
+ pcmpeqw mm6, mm6
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+ pand mm6, mm7 ; mm6=Y(0246)=YE
+ psrlw mm7, BYTE_BIT ; mm7=Y(1357)=YO
+
+ movq mm1, mm0 ; mm1=mm0=(R-Y)(L/H)
+ movq mm3, mm2 ; mm3=mm2=(G-Y)(L/H)
+ movq mm5, mm4 ; mm5=mm4=(B-Y)(L/H)
+
+ paddw mm0, mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+ paddw mm5, mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA
+ movq mmH, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz near .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT
+ movd eax, mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .endcolumn
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .endcolumn
+ movd dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
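+; (The two movs above patch the stack frame built for the first call: ebx
+; points at the saved inptr00 slot, and the output_buf argument sits one
+; pointer below it, so the second call reads the next luma row and writes
+; the next output row.  The earlier "lea esi, [esi+ecx*SIZEOF_JSAMPROW]"
+; makes the h2v1 routine's own indexing by in_row_group_ctr select luma
+; row 2*ctr, as required for 2:1 vertical subsampling.)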
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
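For reference, here is a scalar sketch (in C) of the per-pixel computation that the h2v1 merged-upsample routines above implement. This is a hedged reconstruction from the in-line comments, not the library's actual fallback code (which lives in jdmerge.c); the constant values assume libjpeg's usual SCALEBITS = 16 fixed-point convention.

    /* Hypothetical scalar model: one Cb/Cr sample is shared by two
       horizontally adjacent Y samples.  FIX(x) here means x * 65536. */
    #include <stdint.h>

    static uint8_t clamp255(long v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void h2v1_merged_pair(const uint8_t y[2], uint8_t cb, uint8_t cr,
                                 uint8_t rgb[6]) {
      long cb_ = (long)cb - 128, cr_ = (long)cr - 128;  /* centre chroma */
      /* Assumes arithmetic right shift of negative values. */
      long r_y = (91881 * cr_ + 32768) >> 16;                 /* FIX(1.40200) */
      long g_y = (-22554 * cb_ - 46802 * cr_ + 32768) >> 16;  /* -FIX(0.34414),
                                                                 -FIX(0.71414) */
      long b_y = (116130 * cb_ + 32768) >> 16;                /* FIX(1.77200) */
      for (int i = 0; i < 2; i++) {
        rgb[3 * i + 0] = clamp255(y[i] + r_y);
        rgb[3 * i + 1] = clamp255(y[i] + g_y);
        rgb[3 * i + 2] = clamp255(y[i] + b_y);
      }
    }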
diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
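+; (In PIC builds the constant pool is addressed GOT-relative via GOTOFF.
+; The GOT pointer is spilled to a stack slot here because ebx is needed as
+; inptr1 below; it is reloaded into eax at the top of each column loop.)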
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
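+;
+; (This rewrite is needed because FIX(1.40200) and FIX(1.77200) do not fit
+; in a signed 16-bit word, while the fractional parts do.  pmulhw keeps the
+; high 16 bits of each 32-bit product, i.e. an arithmetic shift right by
+; 16, so the chroma values are doubled beforehand and the PW_ONE/psraw-1
+; pair afterwards halves the result with rounding, preserving one extra
+; bit of precision.)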
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [GOTOFF(eax,PW_ONE)]
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [GOTOFF(eax,PW_ONE)]
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
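+ ; If the output pointer is 16-byte aligned, use non-temporal (streaming)
+ ; stores so that pixel data, which is unlikely to be re-read soon, does
+ ; not displace cached data; the sfence at .endcolumn then orders these
+ ; weakly-ordered stores before returning.  Unaligned outputs fall back
+ ; to movdqu.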
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, POINTER [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
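+; (Same stack-patching trick as in the MMX h2v2 routine above: redirect
+; the saved inptr00 and output_buf so the second call handles luma row
+; 2*ctr+1.)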
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1
+PW_TWO times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
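+;
+; Concretely, for input samples s[i] the code below computes
+;   out[2*i]   = (3*s[i] + s[i-1] + 1) / 4
+;   out[2*i+1] = (3*s[i] + s[i+1] + 2) / 4
+; with the bias alternating between 1 and 2 so that the truncation errors
+; tend to cancel.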
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_YMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
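+; (If the width is not a multiple of the vector size, the last sample is
+; replicated one position past the end of the row, so the full-width load
+; of the final column block reads a valid value for the s[i+1] term at the
+; right edge; eax is then rounded up to whole YMMWORDs below.)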
+.skip:
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+ vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ vpcmpeqb xmm6, xmm6, xmm6
+ vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
+ vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vperm2i128 ymm6, ymm0, ymm6, 0x20
+ vpslldq ymm6, ymm6, 15
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+
+ vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+ vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+ vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+ sub eax, byte SIZEOF_YMMWORD
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
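+;
+; Each output pixel here is a 9/16, 3/16, 3/16, 1/16 weighted average of
+; the four nearest input pixels: a vertical pass forms Int = 3*row[0] +
+; row[+/-1], and the horizontal pass then computes
+;   out_even = (3*Int[i] + Int[i-1] + 8) / 16
+;   out_odd  = (3*Int[i] + Int[i+1] + 7) / 16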
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_YMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1
+ vmovdqa YMMWORD [wk(1)], ymm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpcmpeqb xmm1, xmm1, xmm1
+ vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+ vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm3, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm3, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1
+ vmovdqa YMMWORD [wk(3)], ymm2
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm0, ymm1, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm1, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm1, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm1, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4
+
+ vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm7, ymm1, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm1, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm1, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm1, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3
+
+ vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_YMMWORD
+ add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
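+; Each input sample is simply replicated (out[2*i] = out[2*i+1] = s[i]),
+; which the code below implements by unpacking a register with itself.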
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .above_16
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
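+ ; (0xd8 swaps the middle 64-bit quarters so that the in-lane
+ ; unpacks below leave the doubled samples in sequential order)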
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .above_16
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp near .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add ebx, 2*SIZEOF_YMMWORD ; outptr0
+ add edi, 2*SIZEOF_YMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE times 4 dw 1
+PW_TWO times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor mm0, mm0 ; mm0=(all 0's)
+ pcmpeqb mm7, mm7
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
+ pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb mm6, mm6
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+ pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2, mm1
+ movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
+ psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
+ psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
+
+ por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
+ por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
+
+ movq mm7, mm1
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
+
+ movq mm4, mm1
+ punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
+ punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
+ movq mm5, mm2
+ punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
+ punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
+ movq mm6, mm3
+ punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
+ punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
+
+ pmullw mm1, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+ paddw mm2, [GOTOFF(ebx,PW_ONE)]
+ paddw mm5, [GOTOFF(ebx,PW_ONE)]
+ paddw mm3, [GOTOFF(ebx,PW_TWO)]
+ paddw mm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw mm2, mm1
+ paddw mm5, mm4
+ psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
+ psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
+ paddw mm3, mm1
+ paddw mm6, mm4
+ psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
+ psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
+
+ psllw mm3, BYTE_BIT
+ psllw mm6, BYTE_BIT
+ por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
+ por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+ sub eax, byte SIZEOF_MMWORD
+ add esi, byte 1*SIZEOF_MMWORD ; inptr
+ add edi, byte 2*SIZEOF_MMWORD ; outptr
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
+ movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
+ movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+
+ pcmpeqb mm7, mm7
+ psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+ pand mm1, mm7 ; mm1=( 0 - - -)
+ pand mm2, mm7 ; mm2=( 0 - - -)
+
+ movq MMWORD [wk(0)], mm1
+ movq MMWORD [wk(1)], mm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb mm1, mm1
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
+ movq mm2, mm1
+
+ pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
+ pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
+
+ movq MMWORD [wk(2)], mm1
+ movq MMWORD [wk(3)], mm2
+
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
+ movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
+ psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
+
+ movq MMWORD [wk(2)], mm1
+ movq MMWORD [wk(3)], mm2
+
+.upsample:
+ ; -- process the upper row
+
+ movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
+ movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
+
+ movq mm0, mm7
+ movq mm4, mm3
+ psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
+ psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
+ movq mm5, mm7
+ movq mm6, mm3
+ psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
+ psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
+
+ por mm0, mm4 ; mm0=( 1 2 3 4)
+ por mm5, mm6 ; mm5=( 3 4 5 6)
+
+ movq mm1, mm7
+ movq mm2, mm3
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
+ movq mm4, mm3
+ psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
+
+ por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
+ por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
+
+ movq MMWORD [wk(0)], mm4
+
+ pmullw mm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm3, [GOTOFF(ebx,PW_THREE)]
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm7
+ paddw mm5, mm3
+ psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
+ psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
+ paddw mm0, mm7
+ paddw mm2, mm3
+ psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
+ psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
+
+ psllw mm0, BYTE_BIT
+ psllw mm2, BYTE_BIT
+ por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
+ por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+ ; -- process the lower row
+
+ movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
+ movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
+
+ movq mm7, mm6
+ movq mm3, mm4
+ psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
+ psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
+ movq mm0, mm6
+ movq mm2, mm4
+ psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
+ psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
+
+ por mm7, mm3 ; mm7=( 1 2 3 4)
+ por mm0, mm2 ; mm0=( 3 4 5 6)
+
+ movq mm1, mm6
+ movq mm5, mm4
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
+ movq mm3, mm4
+ psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
+
+ por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
+ por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
+
+ movq MMWORD [wk(1)], mm3
+
+ pmullw mm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm6
+ paddw mm0, mm4
+ psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
+ psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
+ paddw mm7, mm6
+ paddw mm5, mm4
+ psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
+ psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
+
+ psllw mm7, BYTE_BIT
+ psllw mm5, BYTE_BIT
+ por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
+ por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_MMWORD
+ add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_MMWORD ; inptr0
+ add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_MMWORD ; outptr0
+ add edi, byte 2*SIZEOF_MMWORD ; outptr1
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
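+; In scalar terms this is plain sample doubling (a sketch mirroring the
+; reference implementation in jdsample.c):
+;
+;   out[2*i] = out[2*i + 1] = in[i];
+;
+; punpcklbw/punpckhbw with both operands set to the same register perform
+; this duplication for eight samples at a time.
+;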
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD)
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0
+ punpckhbw mm1, mm1
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2
+ punpckhbw mm3, mm3
+
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 4*SIZEOF_MMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
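+; Identical to the h2v1 doubling above, except that each doubled row is
+; stored to two consecutive output rows (a sketch):
+;
+;   out0[2*i] = out0[2*i + 1] = in[i];
+;   out1[2*i] = out1[2*i + 1] = in[i];
+;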
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD)
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0
+ punpckhbw mm1, mm1
+
+ movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2
+ punpckhbw mm3, mm3
+
+ movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add ebx, byte 4*SIZEOF_MMWORD ; outptr0
+ add edi, byte 4*SIZEOF_MMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
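+; In scalar terms, each input sample yields two output samples (a sketch
+; mirroring the reference implementation in jdsample.c):
+;
+;   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;  /* 3/4 this + 1/4 left */
+;   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;  /* 3/4 this + 1/4 right */
+;
+; with the first and last input samples replicated at the row edges.
+;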
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-1)
+ pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+ pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm2, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm5, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm3, [GOTOFF(ebx,PW_TWO)]
+ paddw xmm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+ sub eax, byte SIZEOF_XMMWORD
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
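+; The arithmetic is the same as in the MMX h2v2 routine earlier in this
+; patch (vertical 3:1 blend, then horizontal 3:1 blend with rounding
+; biases 8 and 7); here each XMM register carries 16 samples instead of 8.
+;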
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+    pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_XMMWORD
+ add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
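+; As in the MMX version, this is plain sample doubling
+; (out[2*i] = out[2*i + 1] = in[i]), with punpcklbw/punpckhbw of a
+; register with itself duplicating 16 samples per XMM register.
+;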
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
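+; Sample doubling as above, with each doubled row written to two output
+; rows (outptr0 and outptr1).
+;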
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
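+;
+; For reference, one 1-D pass of this transform in scalar C (a sketch
+; following the structure of the IJG's jfdctflt.c; the function name is
+; illustrative).  The code below runs two such passes at a time, one in
+; each float lane of the 64-bit 3DNow! registers:
+;
+;   void fdct_float_1d(FAST_FLOAT *d)
+;   {
+;     FAST_FLOAT tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
+;     FAST_FLOAT tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
+;     FAST_FLOAT tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
+;     FAST_FLOAT tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];
+;
+;     /* Even part */
+;     FAST_FLOAT tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
+;     FAST_FLOAT tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+;     d[0] = tmp10 + tmp11;
+;     d[4] = tmp10 - tmp11;
+;     FAST_FLOAT z1 = (tmp12 + tmp13) * 0.707106781F;  /* PD_0_707 */
+;     d[2] = tmp13 + z1;
+;     d[6] = tmp13 - z1;
+;
+;     /* Odd part */
+;     tmp10 = tmp4 + tmp5;  tmp11 = tmp5 + tmp6;  tmp12 = tmp6 + tmp7;
+;     FAST_FLOAT z5 = (tmp10 - tmp12) * 0.382683433F;  /* PD_0_382 */
+;     FAST_FLOAT z2 = 0.541196100F * tmp10 + z5;       /* PD_0_541 */
+;     FAST_FLOAT z4 = 1.306562965F * tmp12 + z5;       /* PD_1_306 */
+;     FAST_FLOAT z3 = tmp11 * 0.707106781F;
+;     FAST_FLOAT z11 = tmp7 + z3, z13 = tmp7 - z3;
+;     d[5] = z13 + z2;  d[3] = z13 - z2;
+;     d[1] = z11 + z4;  d[7] = z11 - z4;
+;   }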
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+;
+
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 10)=data0
+ punpckhdq mm4, mm1 ; mm4=(01 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(06 16)=data6
+ punpckhdq mm5, mm3 ; mm5=(07 17)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(02 12)=data2
+ punpckhdq mm4, mm3 ; mm4=(03 13)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(04 14)=data4
+ punpckhdq mm0, mm5 ; mm0=(05 15)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 01)=data0
+ punpckhdq mm4, mm1 ; mm4=(10 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(60 61)=data6
+ punpckhdq mm5, mm3 ; mm5=(70 71)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(20 21)=data2
+ punpckhdq mm4, mm3 ; mm4=(30 31)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(40 41)=data4
+ punpckhdq mm0, mm5 ; mm0=(50 51)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .columnloop
+
+ femms ; empty MMX/3DNow! state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
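+;
+; The algorithm is identical to the 3DNow! version (see the scalar sketch
+; in jfdctflt-3dn.asm above); here each pass handles four rows or columns
+; at a time in the four float lanes of an XMM register.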
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
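+
+; (shufps dst, src, imm selects result elements 0-1 from dst and 2-3 from
+; src via two-bit indices: 0x44 = 01_00_01_00b yields (dst0 dst1 src0 src1),
+; and 0xEE = 11_10_11_10b yields (dst2 dst3 src2 src3).)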
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, byte 4*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see that file for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
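+
+; Worked example (CONST_BITS == 8): 410903207 = round(0.382683433 * 2^30),
+; so F_0_382 = DESCALE(410903207, 22) = (410903207 + (1 << 21)) >> 22 = 98,
+; the same value as FIX(0.382683433) = round(0.382683433 * 2^8) above.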
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
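+
+; Worked example: pmulhw keeps the high 16 bits of a 16x16-bit signed
+; product, i.e. (a * b) >> 16.  With a temporary v pre-shifted left by
+; PRE_MULTIPLY_SCALE_BITS and a constant stored as F << CONST_SHIFT,
+; where F = round(f * 2^CONST_BITS), the result is
+;   ((v << 2) * (F << 6)) >> 16 = (v * F) >> 8 ~= v * f
+; e.g. v = 100, f = 0.707106781: F = 181, (400 * 11584) >> 16 = 70.
+; This is why the three shift amounts above must sum to 16.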
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1
+ movq mm4, mm5
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0
+ movq mm4, mm5
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1
+ movq mm4, mm5
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0
+ movq mm4, mm5
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+ add edx, byte 4*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
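
For orientation, each loop iteration above is a 4-wide SIMD rendition of the scalar even/odd butterflies of the AA&N algorithm in IJG's jfdctfst.c. A condensed C sketch of one 8-point transform follows; the floating-point constants stand in for the pmulhw fixed-point multiplies, and the variable names match the register comments above:

#include <stdio.h>

typedef short DCTELEM;

/* MUL(v, c) stands in for the pmulhw fixed-point multiply by FIX(c) */
#define MUL(v, c)  ((DCTELEM)((v) * (c)))

static void fdct_ifast_1d(DCTELEM d[8])
{
  DCTELEM tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
  DCTELEM tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
  DCTELEM tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
  DCTELEM tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];

  /* Even part */
  DCTELEM tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
  DCTELEM tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
  d[0] = tmp10 + tmp11;
  d[4] = tmp10 - tmp11;
  DCTELEM z1 = MUL(tmp12 + tmp13, 0.707106781);
  d[2] = tmp13 + z1;
  d[6] = tmp13 - z1;

  /* Odd part */
  tmp10 = tmp4 + tmp5; tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7;
  DCTELEM z5 = MUL(tmp10 - tmp12, 0.382683433);
  DCTELEM z2 = MUL(tmp10, 0.541196100) + z5;
  DCTELEM z4 = MUL(tmp12, 1.306562965) + z5;
  DCTELEM z3 = MUL(tmp11, 0.707106781);
  DCTELEM z11 = tmp7 + z3, z13 = tmp7 - z3;
  d[5] = z13 + z2;
  d[3] = z13 - z2;
  d[1] = z11 + z4;
  d[7] = z11 - z4;
}

int main(void)
{
  DCTELEM d[8] = { 52, 55, 61, 66, 70, 61, 64, 73 };
  fdct_ifast_1d(d);
  for (int i = 0; i < 8; i++)
    printf("%d ", d[i]);
  printf("\n");
  return 0;
}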
diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on IJG's original jfdctfst.c; see jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
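
Unlike the MMX version, this routine needs no row/column loop: each pass handles the full 8x8 block in registers, and the three punpck "phases" form a classic merge transpose (16-bit, then 32-bit, then 64-bit interleaves). The plain-C model below mimics those primitives and prints the first transposed row; it is a sketch for illustration, not library code:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef int16_t vec8[8];

/* punpcklwd / punpckhwd: interleave 16-bit words from low / high halves */
static void unpack_wd(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  for (int i = 0; i < 4; i++) {
    lo[2 * i] = a[i];      lo[2 * i + 1] = b[i];
    hi[2 * i] = a[i + 4];  hi[2 * i + 1] = b[i + 4];
  }
}

/* punpckldq / punpckhdq: same interleave on 32-bit lanes (word pairs) */
static void unpack_dq(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  for (int i = 0; i < 2; i++) {
    memcpy(&lo[4 * i],     &a[2 * i],     2 * sizeof(int16_t));
    memcpy(&lo[4 * i + 2], &b[2 * i],     2 * sizeof(int16_t));
    memcpy(&hi[4 * i],     &a[2 * i + 4], 2 * sizeof(int16_t));
    memcpy(&hi[4 * i + 2], &b[2 * i + 4], 2 * sizeof(int16_t));
  }
}

/* punpcklqdq / punpckhqdq: 64-bit lanes (word quads) */
static void unpack_qdq(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  memcpy(&lo[0], &a[0], 4 * sizeof(int16_t));
  memcpy(&lo[4], &b[0], 4 * sizeof(int16_t));
  memcpy(&hi[0], &a[4], 4 * sizeof(int16_t));
  memcpy(&hi[4], &b[4], 4 * sizeof(int16_t));
}

int main(void)
{
  vec8 r[8], t1[8], t2[8], t3[8];
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      r[i][j] = (int16_t)(10 * i + j);  /* element "ij", as in the comments */

  for (int i = 0; i < 4; i++)                /* phase 1: 16-bit interleave */
    unpack_wd(r[2 * i], r[2 * i + 1], t1[2 * i], t1[2 * i + 1]);
  for (int i = 0; i < 2; i++) {              /* phase 2: 32-bit interleave */
    unpack_dq(t1[4 * i],     t1[4 * i + 2], t2[4 * i],     t2[4 * i + 1]);
    unpack_dq(t1[4 * i + 1], t1[4 * i + 3], t2[4 * i + 2], t2[4 * i + 3]);
  }
  for (int i = 0; i < 4; i++)                /* phase 3: 64-bit interleave */
    unpack_qdq(t2[i], t2[i + 4], t3[2 * i], t3[2 * i + 1]);

  printf("row 0 of the transpose: ");
  for (int j = 0; j < 8; j++)
    printf("%02d ", t3[0][j]);               /* 00 10 20 30 40 50 60 70 */
  printf("\n");
  return 0;
}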
diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
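
Both branches of the %if produce identical tables: FIX(c) is c scaled to CONST_BITS fraction bits, and the DESCALE form merely re-derives the same value from a 30-fraction-bit integer, because NASM cannot evaluate floating-point expressions. A quick C check for one constant (illustrative only):

#include <stdio.h>

#define CONST_BITS  13
#define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))

int main(void)
{
  /* FIX(0.541196100) two ways: directly, and via the 30-bit constant */
  int direct  = (int)(0.541196100 * (1 << CONST_BITS) + 0.5);  /* 4433 */
  int derived = DESCALE(581104887, 30 - CONST_BITS);           /* 4433 */
  printf("%d %d\n", direct, derived);
  return 0;
}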
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D
+ vpermq %2, %2, 0x8D
+ vpermq %3, %3, 0xD8
+ vpermq %4, %4, 0xD8
+ ; transpose coefficients(phase 3)
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
+%endmacro
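
The phase-3 vpermq step selects 64-bit lanes by a 2-bit-per-lane immediate: 0x8D picks input lanes 1, 3, 0, 2 and 0xD8 picks 0, 2, 1, 3, which is what swaps the row halves into the ordering that dodct expects. A tiny C model of the selection (a sketch):

#include <stdio.h>
#include <stdint.h>

/* Model of vpermq ymm, ymm, imm8: output 64-bit lane k comes from input
 * lane (imm >> (2*k)) & 3.  Illustrative only. */
static void vpermq_model(const uint64_t src[4], uint64_t dst[4], int imm)
{
  for (int k = 0; k < 4; k++)
    dst[k] = src[(imm >> (2 * k)) & 3];
}

int main(void)
{
  uint64_t in[4] = { 0, 1, 2, 3 }, out[4];
  vpermq_model(in, out, 0x8D);        /* lanes 1, 3, 0, 2 -- as above */
  printf("%llu %llu %llu %llu\n",
         (unsigned long long)out[0], (unsigned long long)out[1],
         (unsigned long long)out[2], (unsigned long long)out[3]);
  return 0;
}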
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9: Pass (1 or 2)
+
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L
+ vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
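
The "(This implementation)" comments in dodct describe an exact integer refactoring: distributing z1 over both outputs turns each output into a two-term dot product, which vpmaddwd evaluates in one instruction against paired constants such as PW_F130_F054_MF130_F054 below. A C check of the data2/data6 case with the CONST_BITS == 13 values (a sketch, not library code):

#include <stdio.h>
#include <stdint.h>

#define F_0_541  4433
#define F_0_765  6270
#define F_1_847  15137

/* pmaddwd on one word pair: multiply adjacent 16-bit words, sum to 32 bits */
static int32_t pmaddwd(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  int16_t tmp12 = -311, tmp13 = 427;    /* arbitrary intermediates */

  /* Original formulation */
  int32_t z1    = (tmp12 + tmp13) * F_0_541;
  int32_t data2 = z1 + tmp13 * F_0_765;
  int32_t data6 = z1 + tmp12 * -F_1_847;

  /* pmaddwd formulation with the paired constants */
  int32_t data2p = pmaddwd(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
  int32_t data6p = pmaddwd(tmp13, tmp12, F_0_541, F_0_541 - F_1_847);

  printf("%d==%d  %d==%d\n", data2, data2p, data6, data6p);
  return 0;
}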
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
+ times 4 dw (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(ebp)] ; (DCTELEM *)
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm4, ymm6, 0x31
+ vperm2i128 ymm2, ymm5, ymm7, 0x20
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+ vzeroupper
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
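
One AVX2-specific trick in dodct deserves a note: with tmp10 in the low 128-bit lane and tmp11 in the high lane, a lane swap (vperm2i128 with immediate 0x01) plus a sign flip of the high lane (vpsignw against PW_1_NEG1) lets a single vpaddw produce both tmp10+tmp11 and tmp10-tmp11 at once. A scalar C model of those three instructions (a sketch):

#include <stdio.h>
#include <stdint.h>

enum { LANE = 8 };   /* one 128-bit lane = eight 16-bit words */

int main(void)
{
  /* reg = tmp10_11: low lane holds tmp10, high lane holds tmp11 */
  int16_t reg[2 * LANE], swapped[2 * LANE], flipped[2 * LANE], out[2 * LANE];
  for (int i = 0; i < LANE; i++) {
    reg[i] = (int16_t)(100 + i);
    reg[LANE + i] = (int16_t)(7 * i);
  }

  for (int i = 0; i < LANE; i++) {       /* vperm2i128 ..., 0x01 */
    swapped[i] = reg[LANE + i];          /* -> tmp11_10 */
    swapped[LANE + i] = reg[i];
  }
  for (int i = 0; i < 2 * LANE; i++)     /* vpsignw with PW_1_NEG1 */
    flipped[i] = (i < LANE) ? reg[i] : (int16_t)-reg[i];  /* tmp10_neg11 */

  for (int i = 0; i < 2 * LANE; i++)     /* one vpaddw */
    out[i] = (int16_t)(swapped[i] + flipped[i]);
  /* low lane: tmp11 + tmp10; high lane: tmp10 - tmp11 */
  printf("%d %d\n", out[0], out[LANE]);
  return 0;
}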
diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ psllw mm5, PASS1_BITS ; mm5=data0
+ psllw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm1, DESCALE_P1
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm2, DESCALE_P1
+ psrad mm7, DESCALE_P1
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm3, DESCALE_P1
+ psrad mm5, DESCALE_P1
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw mm5, PASS1_BITS ; mm5=data0
+ psraw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm0, DESCALE_P2
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm6, DESCALE_P2
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm2, DESCALE_P2
+ psrad mm7, DESCALE_P2
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm3, DESCALE_P2
+ psrad mm5, DESCALE_P2
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
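
As in jfdctint.c, the two passes carry different scale factors on purpose: pass 1 keeps PASS1_BITS extra fraction bits in each 16-bit word (psllw on the multiply-free data0/data4 paths, a descale by CONST_BITS - PASS1_BITS after pmaddwd), and pass 2 removes them again with the rounded shifts against PW_DESCALE_P2X / PD_DESCALE_P2, leaving the usual 8x-scaled DCT output. A C sketch of that bookkeeping on the data0 path (illustrative values, not library code):

#include <stdio.h>
#include <stdint.h>

#define PASS1_BITS  2

/* Rounded right shift, as PD_DESCALE_P* / PW_DESCALE_P2X implement it */
#define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))

int main(void)
{
  int16_t tmp10 = 903, tmp11 = -488;          /* per-pass intermediates */

  /* Pass 1: exact, just keep PASS1_BITS extra fraction bits (psllw) */
  int16_t d0_pass1 = (int16_t)((tmp10 + tmp11) << PASS1_BITS);

  /* Pass 2: rounded shift drops them (paddw PW_DESCALE_P2X; psraw) */
  int16_t d0_pass2 = (int16_t)DESCALE(d0_pass1, PASS1_BITS);

  printf("%d -> %d\n", d0_pass1, d0_pass2);   /* 1660 -> 415 */
  return 0;
}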
diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
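+
+; Worked example: with CONST_BITS = 13 the two branches above agree, e.g.
+;   FIX(0.298631336) = 0.298631336 * 2^13 = 2446.35...        ->  2446
+;   DESCALE(320652955, 30 - 13) = (320652955 + 2^16) >> 17    ->  2446
+; where 320652955 is 0.298631336 * 2^30 rounded to the nearest integer.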
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
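+
+; PD_DESCALE_P1/P2 hold the 1 << (n-1) rounding bias of
+;   DESCALE(x, n) = (x + (1 << (n-1))) >> n
+; so that the psrad shifts below round to nearest instead of truncating
+; toward minus infinity; PW_DESCALE_P2X plays the same role for the
+; word-wide psraw shifts in pass 2.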
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 6
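+
+; (The prologue below saves the pre-alignment stack pointer at
+; original_ebp, aligns the frame to a 16-byte boundary, and reserves
+; WK_NUM xmmword slots beneath it, so that the wk(i) scratch slots can
+; be accessed with aligned movdqa loads and stores.)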
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
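+    ; (with the jsimdext.inc macros, GOTOFF(ebx,sym) resolves the
+    ; constant block relative to the GOT base loaded here when
+    ; assembling PIC code; in non-PIC builds get_GOT/pushpic/poppic
+    ; assemble to nothing and GOTOFF is a plain absolute reference)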
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
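+;
+; (Each pmaddwd below computes lo_word * lo_const + hi_word * hi_const
+; per dword lane; with tmp13 interleaved into the low words and tmp12
+; into the high words, the constant pair PW_F130_F054 =
+; ((F_0_541 + F_0_765), F_0_541) yields data2 in one multiply-add, and
+; PW_F054_MF130 likewise yields data6.)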
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
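+;
+; (Both right-hand sides use the original z3/z4 values; the code below
+; keeps copies of the interleaved inputs so the two pmaddwd products
+; are formed from the same sources.)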
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
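+;
+; (Expanding the first line shows the folding of z1:
+;   tmp4 * 0.298631336 + z1
+;     = tmp4 * 0.298631336 - (tmp4 + tmp7) * 0.899976223
+;     = tmp4 * (0.298631336 - 0.899976223) + tmp7 * (-0.899976223)
+; which is exactly the constant pair PW_MF060_MF089, so each output is
+; again one pmaddwd plus the stored z3/z4 term.)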
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414 times 2 dd 1.414213562373095048801689
+PD_1_847 times 2 dd 1.847759065022573512256366
+PD_1_082 times 2 dd 1.082392200292393968799446
+PD_2_613 times 2 dd 2.613125929752753055713286
+PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
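+; (Adding PD_RNDINT_MAGIC to a float x that is much smaller in magnitude
+; than the constant pins the result's exponent so that one unit in the
+; last place equals 8; the low mantissa word of each lane then holds
+; roundint(x/8), i.e. the rounding to int and the final /8 descale of
+; the IDCT happen in a single pfadd.  See the output stages below.)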
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ pushpic ebx ; save GOT address
+ mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ or eax, ebx
+ poppic ebx ; restore GOT address
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
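+    ; (A block whose AC coefficients are all zero has a flat IDCT, so
+    ; it suffices to dequantize the DC terms of the two columns handled
+    ; per iteration and replicate them down the work array.)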
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
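+    ; (punpcklwd of a register with itself doubles each 16-bit
+    ; coefficient into a dword and the arithmetic shift by
+    ; DWORD_BIT-WORD_BIT = 16 sign-extends it, giving pi2fd proper
+    ; int32 input)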
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm1, mm0
+ punpckldq mm0, mm0
+ punpckhdq mm1, mm1
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ punpcklwd mm1, mm1
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
+ pi2fd mm1, mm1
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
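+    ; (AAN even part: tmp12 = (dequantized in2 - in6) * sqrt(2) - tmp13;
+    ; the other even-part multiplies are folded into the scaled
+    ; dequantization table, so PD_1_414 is the only constant needed here)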
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm5, mm5
+ punpcklwd mm1, mm1
+ psrad mm5, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm5, mm5
+ pi2fd mm1, mm1
+
+ pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 01)
+ pfadd mm7, mm3 ; mm7=data1=(10 11)
+ pfsub mm5, mm1 ; mm5=data7=(70 71)
+ pfsub mm0, mm3 ; mm0=data6=(60 61)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, mm6 ; transpose coefficients
+ punpckldq mm6, mm7 ; mm6=(00 10)
+ punpckhdq mm1, mm7 ; mm1=(01 11)
+ movq mm3, mm0 ; transpose coefficients
+ punpckldq mm0, mm5 ; mm0=(60 70)
+ punpckhdq mm3, mm5 ; mm3=(61 71)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm5, MMWORD [wk(1)] ; mm5=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm6, mm7
+ movq mm1, mm5
+ pfadd mm7, mm2 ; mm7=data2=(20 21)
+ pfadd mm5, mm4 ; mm5=data4=(40 41)
+ pfsub mm6, mm2 ; mm6=data5=(50 51)
+ pfsub mm1, mm4 ; mm1=data3=(30 31)
+
+ movq mm0, mm7 ; transpose coefficients
+ punpckldq mm7, mm1 ; mm7=(20 30)
+ punpckhdq mm0, mm1 ; mm0=(21 31)
+ movq mm3, mm5 ; transpose coefficients
+ punpckldq mm5, mm6 ; mm5=(40 50)
+ punpckhdq mm3, mm6 ; mm3=(41 51)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+ add esi, byte 2*SIZEOF_JCOEF ; coef_block
+ add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
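+    ; (after the four column iterations esi has advanced 8*SIZEOF_JCOEF,
+    ; so the (DCTSIZE2-8)*SIZEOF_JCOEF displacement lands on the start
+    ; of the next 64-coefficient block; four 32-byte prefetches cover
+    ; its 128 bytes)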
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 10)
+ pfadd mm7, mm3 ; mm7=data1=(01 11)
+ pfsub mm5, mm1 ; mm5=data7=(07 17)
+ pfsub mm0, mm3 ; mm0=data6=(06 16)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
+ pcmpeqd mm3, mm3
+ psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
+ pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
+ pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
+ pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+ pand mm6, mm3 ; mm6=(00 -- 10 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11)
+ pand mm0, mm3 ; mm0=(06 -- 16 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17)
+ por mm6, mm7 ; mm6=(00 01 10 11)
+ por mm0, mm5 ; mm0=(06 07 16 17)
+
+ movq mm1, MMWORD [wk(0)] ; mm1=tmp2
+ movq mm3, MMWORD [wk(1)] ; mm3=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm7, mm1
+ movq mm5, mm3
+ pfadd mm1, mm2 ; mm1=data2=(02 12)
+ pfadd mm3, mm4 ; mm3=data4=(04 14)
+ pfsub mm7, mm2 ; mm7=data5=(05 15)
+ pfsub mm5, mm4 ; mm5=data3=(03 13)
+
+ movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
+ pcmpeqd mm4, mm4
+ psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
+ pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
+ pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
+ pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+ pand mm3, mm4 ; mm3=(04 -- 14 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15)
+ pand mm1, mm4 ; mm1=(02 -- 12 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13)
+ por mm3, mm7 ; mm3=(04 05 14 15)
+ por mm1, mm5 ; mm1=(02 03 12 13)
+
+ movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
+
+ packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15)
+ packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17)
+ paddb mm6, mm2
+ paddb mm1, mm2
+
+ movq mm4, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 2*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
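+
+; (shufps picks two dwords from the destination and two from the source:
+; imm8 0x44 = 01 00 01 00b keeps elements 0,1 of %1 and appends elements
+; 0,1 of %2, while 0xEE = 11 10 11 10b takes elements 2,3 of each.
+; These act as 64-bit low/high unpacks on float data, for which SSE1 has
+; no direct instruction.)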
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
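+; (negated relative to the 3DNow! version's PD_2_613: SSE has no
+; reversed subtract like pfsubr, so tmp12 is formed below as
+; z5 + z10 * (-2.613125930) using a plain addps)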
+PD_0_125 times 4 dd 0.125 ; 1/8
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
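+    ; (this slice-wide test ORs AC rows 1-7 together in two MMX
+    ; accumulators, packs the result to bytes, and tests a single
+    ; dword, so one branch decides the DC-only shortcut for all four
+    ; columns)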
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm1, mm0 ; mm1=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm4, mm0 ; mm4=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm5, mm1 ; mm5=(** 22 ** 23)
+ punpcklwd mm1, mm1 ; mm1=(20 20 21 21)
+
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
+ cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **)
+
+ punpckhwd mm6, mm2 ; mm6=(** 42 ** 43)
+ punpcklwd mm2, mm2 ; mm2=(40 40 41 41)
+ punpckhwd mm7, mm3 ; mm7=(** 62 ** 63)
+ punpcklwd mm3, mm3 ; mm3=(60 60 61 61)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
+ cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **)
+ cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **)
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
+ cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **)
+ cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **)
+
+ movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03)
+ movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23)
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43)
+ movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63)
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm6, mm4 ; mm6=(** 12 ** 13)
+ punpcklwd mm4, mm4 ; mm4=(10 10 11 11)
+ punpckhwd mm2, mm0 ; mm2=(** 32 ** 33)
+ punpcklwd mm0, mm0 ; mm0=(30 30 31 31)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
+ cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **)
+ cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
+ cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **)
+ cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **)
+
+ punpckhwd mm7, mm5 ; mm7=(** 52 ** 53)
+ punpcklwd mm5, mm5 ; mm5=(50 50 51 51)
+ punpckhwd mm3, mm1 ; mm3=(** 72 ** 73)
+ punpcklwd mm1, mm1 ; mm1=(70 70 71 71)
+
+ movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13)
+ movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33)
+
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
+ cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **)
+ cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
+ cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53)
+ movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73)
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
+
+ mulps xmm6, xmm1 ; descale(1/8)
+ mulps xmm7, xmm1 ; descale(1/8)
+ mulps xmm5, xmm1 ; descale(1/8)
+ mulps xmm0, xmm1 ; descale(1/8)
+
+ movhlps xmm3, xmm6
+ movhlps xmm1, xmm7
+ cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10)
+ cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11)
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31)
+ packssdw mm0, mm2 ; mm0=data0=(00 10 20 30)
+ packssdw mm1, mm3 ; mm1=data1=(01 11 21 31)
+
+ movhlps xmm6, xmm5
+ movhlps xmm7, xmm0
+ cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17)
+ cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16)
+ cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37)
+ cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36)
+ packssdw mm4, mm6 ; mm4=data7=(07 17 27 37)
+ packssdw mm5, mm7 ; mm5=data6=(06 16 26 36)
+
+ packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36)
+ packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37)
+
+ movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
+ movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm5, xmm3
+ movaps xmm0, xmm1
+ addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+ addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34)
+ subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35)
+ subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33)
+
+ mulps xmm3, xmm6 ; descale(1/8)
+ mulps xmm1, xmm6 ; descale(1/8)
+ mulps xmm5, xmm6 ; descale(1/8)
+ mulps xmm0, xmm6 ; descale(1/8)
+
+ movhlps xmm7, xmm3
+ movhlps xmm2, xmm1
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14)
+ cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32)
+ cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34)
+ packssdw mm2, mm6 ; mm2=data2=(02 12 22 32)
+ packssdw mm3, mm7 ; mm3=data4=(04 14 24 34)
+
+ movhlps xmm4, xmm5
+ movhlps xmm6, xmm0
+ cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15)
+ cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13)
+ cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35)
+ cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33)
+ packssdw mm5, mm6 ; mm5=data5=(05 15 25 35)
+ packssdw mm4, mm7 ; mm4=data3=(03 13 23 33)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm6
+ paddb mm1, mm6
+ paddb mm2, mm6
+ paddb mm4, mm6
+
+ movq mm7, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37)
+ movq mm3, mm2 ; transpose coefficients(phase 1)
+ punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33)
+ punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm6, mm3 ; transpose coefficients(phase 2)
+ punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17)
+ punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37)
+
+ movq mm1, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17)
+ movq mm4, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
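+ ; (Quantized blocks are frequently sparse: when every AC coefficient
+ ; in these four columns is zero, each output column is just its
+ ; dequantized DC term broadcast to all eight rows, so the full 1-D
+ ; IDCT can be skipped.)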
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
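+ ; (esi has advanced 16 bytes through the column loop, so these four
+ ; 32-byte non-temporal prefetches cover exactly the 128-byte JCOEF
+ ; block following the one just consumed, without displacing hotter
+ ; cache lines.)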
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
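+
+ ; (PD_RNDINT_MAGIC = 1.5 * 2^26, where one float ulp is 2^3, so each
+ ; addps above rounds its data to a multiple of 8 and leaves
+ ; round(data/8) in the low mantissa word: descale-by-8 and float->int
+ ; conversion in a single instruction.  The pand/pslld/por steps then
+ ; merge the 16-bit results of two rows into one register.)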
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
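+
+; (The 10-digit literals above are the same constants with 30 fractional
+; bits, e.g. 1518500249 ~= 1.414213562 * 2^30; DESCALE rounds them down
+; to CONST_BITS fractional bits.)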
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
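+
+; Worked example with CONST_BITS == 8, so CONST_SHIFT == 6: a constant c
+; is stored as FIX(c) << 6 ~= c * 2^14, inputs are pre-shifted left by
+; PRE_MULTIPLY_SCALE_BITS == 2, and pmulhw keeps bits 31:16 of the
+; product, so ((x << 2) * (c * 2^14)) >> 16 = x * c in x's own scale.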
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3, mm4
+ psubw mm4, mm0
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
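+ ;
+ ; Concretely, with CONST_BITS == 8 and CONST_SHIFT == 6:
+ ; FIX(2.613125930) << 6 = 669 << 6 = 42816 does not fit in a signed
+ ; 16-bit word for pmulhw, while FIX(1.613125930) << 6 = 413 << 6 =
+ ; 26432 does.  The remaining "- z10" term is applied exactly by
+ ; subtracting the unscaled copy of z10 kept in mm1.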
+
+ movq mm0, mm5
+ paddw mm5, mm2
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
+ psubw mm0, mm1
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 01 02 03)
+ paddw mm7, mm0 ; mm7=data1=(10 11 12 13)
+ psubw mm1, mm3 ; mm1=data7=(70 71 72 73)
+ psubw mm5, mm0 ; mm5=data6=(60 61 62 63)
+ psubw mm4, mm0 ; mm4=tmp5
+
+ movq mm3, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm3, mm7 ; mm3=(02 12 03 13)
+ movq mm0, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm1 ; mm5=(60 70 61 71)
+ punpckhwd mm0, mm1 ; mm0=(62 72 63 73)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm1, MMWORD [wk(1)] ; mm1=tmp3
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
+ movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm7
+ movq mm0, mm1
+ paddw mm7, mm4 ; mm7=data2=(20 21 22 23)
+ paddw mm1, mm2 ; mm1=data4=(40 41 42 43)
+ psubw mm5, mm4 ; mm5=data5=(50 51 52 53)
+ psubw mm0, mm2 ; mm0=data3=(30 31 32 33)
+
+ movq mm4, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm0 ; mm7=(20 30 21 31)
+ punpckhwd mm4, mm0 ; mm4=(22 32 23 33)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm5 ; mm1=(40 50 41 51)
+ punpckhwd mm2, mm5 ; mm2=(42 52 43 53)
+
+ movq mm0, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(00 10 20 30)
+ punpckhdq mm0, mm7 ; mm0=(01 11 21 31)
+ movq mm5, mm3 ; transpose coefficients(phase 2)
+ punpckldq mm3, mm4 ; mm3=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
+ movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm7 ; mm1=(40 50 60 70)
+ punpckhdq mm6, mm7 ; mm6=(41 51 61 71)
+ movq mm0, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(42 52 62 72)
+ punpckhdq mm0, mm4 ; mm0=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3, mm4
+ psubw mm4, mm0
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0, mm5
+ paddw mm5, mm2
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
+ psubw mm0, mm1
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 10 20 30)
+ paddw mm7, mm0 ; mm7=data1=(01 11 21 31)
+ psraw mm6, (PASS1_BITS+3) ; descale
+ psraw mm7, (PASS1_BITS+3) ; descale
+ psubw mm1, mm3 ; mm1=data7=(07 17 27 37)
+ psubw mm5, mm0 ; mm5=data6=(06 16 26 36)
+ psraw mm1, (PASS1_BITS+3) ; descale
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psubw mm4, mm0 ; mm4=tmp5
+
+ packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36)
+ packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37)
+
+ movq mm3, MMWORD [wk(0)] ; mm3=tmp2
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp3
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm3
+ movq mm1, mm0
+ paddw mm3, mm4 ; mm3=data2=(02 12 22 32)
+ paddw mm0, mm2 ; mm0=data4=(04 14 24 34)
+ psraw mm3, (PASS1_BITS+3) ; descale
+ psraw mm0, (PASS1_BITS+3) ; descale
+ psubw mm5, mm4 ; mm5=data5=(05 15 25 35)
+ psubw mm1, mm2 ; mm1=data3=(03 13 23 33)
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psraw mm1, (PASS1_BITS+3) ; descale
+
+ movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
+
+ packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34)
+ packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35)
+
+ paddb mm6, mm4
+ paddb mm7, mm4
+ paddb mm3, mm4
+ paddb mm1, mm4
+
+ movq mm2, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm0, mm3 ; transpose coefficients(phase 1)
+ punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33)
+ punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm4, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17)
+ punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17)
+ movq mm1, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
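+
+; Note: unlike the MMX implementation, this version needs no DCTSIZE2
+; workspace; after pass 1 all eight columns remain in xmm registers
+; (with col1/col3 parked in the two wk slots), and pass 2 consumes
+; them directly.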
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
+ pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
+ pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8
+ vpermq %6, %2, 0x72
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%12: Temp registers
+; %13: Pass (1 or 2)
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
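+ ;
+ ; The word interleaves below pair a z2 coefficient with a z3
+ ; coefficient in each dword, and vpmaddwd multiplies such a pair by a
+ ; packed (c1, c2) constant pair and sums the two products, so each
+ ; c1*z2 + c2*z3 term costs one instruction per 32-bit lane.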
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
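+ ; (vperm2i128 swaps the two 128-bit lanes, putting in4 opposite in0;
+ ; vpsignw leaves the in0 lane of %1 intact and negates the in4 lane,
+ ; so the vpaddw yields the butterfly sum in one lane and the
+ ; difference in the other without a separate subtract.)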
+
+ vpxor %1, %1, %1
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %3, %8, %5
+ vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %3, %8, %5
+ vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %3, %1, %6
+ vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %3, %1, %6
+ vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L
+ vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage
+
+ vmovdqu %3, %9
+ vmovdqu %4, %10
+
+ vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vmovdqu %7, %11
+ vmovdqu %8, %12
+
+ vpaddd %2, %7, %5 ; %2=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %3, %8, %6 ; %3=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpackssdw %2, %2, %3 ; %2=data3_2
+
+ vpsubd %3, %7, %5 ; %3=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %6, %8, %6 ; %6=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %6, %6, DESCALE_P %+ %13
+ vpackssdw %3, %3, %6 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+ times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
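+ ; wk(i) addresses scratch ymmwords just below the realigned frame
+ ; pointer: the prologue below aligns esp, saves the pre-alignment value
+ ; at the new [esp] so the epilogue's "pop esp" can restore it, and then
+ ; reserves WK_NUM slots.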
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
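+ ; Shortcut: OR rows 1-7 of the coefficient block together; if every AC
+ ; coefficient is zero, each column's IDCT reduces to its scaled DC term,
+ ; and the full column pass can be skipped.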
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0
+ vpacksswb xmm1, xmm1, xmm1
+ vpacksswb xmm1, xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
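+ ; (prefetchnta uses the non-temporal hint to pull the next block toward
+ ; the core with minimal cache pollution, since each coefficient block is
+ ; only read once.)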
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+ vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
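+ ; (zero the upper YMM lanes before the legacy-SSE movq stores below to
+ ; avoid AVX/SSE transition penalties)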
+ vzeroupper
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
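+; Pass 1 descales by CONST_BITS - PASS1_BITS, leaving PASS1_BITS of extra
+; fractional precision in the intermediate data; pass 2 removes those
+; bits plus 3 more, the divide by 8 that completes the scaled 2-D IDCT
+; (see jidctint.c).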
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
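+; FIX(x) denotes round(x * 2^CONST_BITS); for example,
+; FIX(0.541196100) = round(0.541196100 * 8192) = 4433 = F_0_541.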
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 12
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
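+ ; (The 64-bit MMX registers cover only 4 columns at a time, so pass 1
+ ; spills its column results to this JCOEF workspace and pass 2 re-reads
+ ; them 4 rows at a time.)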
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw mm0, PASS1_BITS
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
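+ ;
+ ; (Equivalent after substituting z1 = (z2 + z3) * 0.541196100 and
+ ; collecting terms; the paired constants feed pmaddwd, which multiplies
+ ; interleaved words and sums adjacent products into dwords.)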
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm5, mm7 ; mm5=data0=(00 01 02 03)
+ packssdw mm2, mm0 ; mm2=data7=(70 71 72 73)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P1
+ psrad mm3, DESCALE_P1
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm4, mm3 ; mm4=data1=(10 11 12 13)
+ packssdw mm7, mm0 ; mm7=data6=(60 61 62 63)
+
+ movq mm6, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm4 ; mm5=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm1, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm2 ; mm7=(60 70 61 71)
+ punpckhwd mm1, mm2 ; mm1=(62 72 63 73)
+
+ movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
+ movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
+ movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
+ movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
+ movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
+ movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
+
+ movq mm5, mm3
+ movq mm6, mm0
+ paddd mm3, mm4 ; mm3=data2L
+ paddd mm0, mm2 ; mm0=data2H
+ psubd mm5, mm4 ; mm5=data5L
+ psubd mm6, mm2 ; mm6=data5H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
+
+ paddd mm3, mm7
+ paddd mm0, mm7
+ psrad mm3, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm5, mm7
+ paddd mm6, mm7
+ psrad mm5, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm3, mm0 ; mm3=data2=(20 21 22 23)
+ packssdw mm5, mm6 ; mm5=data5=(50 51 52 53)
+
+ movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
+ movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
+ movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
+ movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
+
+ movq mm0, mm1
+ movq mm6, mm4
+ paddd mm1, mm2 ; mm1=data3L
+ paddd mm4, mm7 ; mm4=data3H
+ psubd mm0, mm2 ; mm0=data4L
+ psubd mm6, mm7 ; mm6=data4H
+
+ movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
+
+ paddd mm1, mm2
+ paddd mm4, mm2
+ psrad mm1, DESCALE_P1
+ psrad mm4, DESCALE_P1
+ paddd mm0, mm2
+ paddd mm6, mm2
+ psrad mm0, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm1, mm4 ; mm1=data3=(30 31 32 33)
+ packssdw mm0, mm6 ; mm0=data4=(40 41 42 43)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
+ movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
+
+ movq mm4, mm3 ; transpose coefficients(phase 1)
+ punpcklwd mm3, mm1 ; mm3=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm6, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm5 ; mm0=(40 50 41 51)
+ punpckhwd mm6, mm5 ; mm6=(42 52 43 53)
+
+ movq mm1, mm7 ; transpose coefficients(phase 2)
+ punpckldq mm7, mm3 ; mm7=(00 10 20 30)
+ punpckhdq mm1, mm3 ; mm1=(01 11 21 31)
+ movq mm5, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
+ movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpckldq mm0, mm3 ; mm0=(40 50 60 70)
+ punpckhdq mm7, mm3 ; mm7=(41 51 61 71)
+ movq mm1, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm4 ; mm6=(42 52 62 72)
+ punpckhdq mm1, mm4 ; mm1=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm5, mm7 ; mm5=data0=(00 10 20 30)
+ packssdw mm2, mm0 ; mm2=data7=(07 17 27 37)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm4, mm3 ; mm4=data1=(01 11 21 31)
+ packssdw mm7, mm0 ; mm7=data6=(06 16 26 36)
+
+ packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36)
+ packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37)
+
+ movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
+ movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
+ movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
+ movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
+ movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
+
+ movq mm7, mm6
+ movq mm2, mm1
+ paddd mm6, mm3 ; mm6=data2L
+ paddd mm1, mm0 ; mm1=data2H
+ psubd mm7, mm3 ; mm7=data5L
+ psubd mm2, mm0 ; mm2=data5H
+
+ movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
+
+ paddd mm6, mm5
+ paddd mm1, mm5
+ psrad mm6, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm7, mm5
+ paddd mm2, mm5
+ psrad mm7, DESCALE_P2
+ psrad mm2, DESCALE_P2
+
+ packssdw mm6, mm1 ; mm6=data2=(02 12 22 32)
+ packssdw mm7, mm2 ; mm7=data5=(05 15 25 35)
+
+ movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
+ movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
+ movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
+ movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
+
+ movq mm1, mm4
+ movq mm2, mm3
+ paddd mm4, mm0 ; mm4=data3L
+ paddd mm3, mm5 ; mm3=data3H
+ psubd mm1, mm0 ; mm1=data4L
+ psubd mm2, mm5 ; mm2=data4H
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
+
+ paddd mm4, mm0
+ paddd mm3, mm0
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm1, mm0
+ paddd mm2, mm0
+ psrad mm1, DESCALE_P2
+ psrad mm2, DESCALE_P2
+
+ movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
+
+ packssdw mm4, mm3 ; mm4=data3=(03 13 23 33)
+ packssdw mm1, mm2 ; mm1=data4=(04 14 24 34)
+
+ movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
+ movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
+
+ packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm5
+ paddb mm3, mm5
+ paddb mm6, mm5
+ paddb mm4, mm5
+
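+ ; The three interleave phases below transpose the packed bytes in
+ ; mm0/mm3/mm6/mm4 into four complete output rows.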
+ movq mm2, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm1, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33)
+ punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35)
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33)
+ movq mm5, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17)
+ punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37)
+
+ movq mm3, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17)
+ movq mm4, mm7 ; transpose coefficients(phase 3)
+ punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 12
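+ ; (wk(0)-wk(7) hold the even-part intermediates within each pass;
+ ; wk(8)-wk(11) double as odd-part scratch and carry col1/col3/col5/col7
+ ; from pass 1 into pass 2.)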
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
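+
+; FIX(x) here denotes x in CONST_BITS fixed point, i.e. round(x * 2^13);
+; for example, 1.847759065 * 8192 = 15136.84..., which rounds to the
+; 15137 used for F_1_847 above.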
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076 times 2 dw F_1_847, -F_0_765
+PW_F256_F089 times 2 dw F_2_562, F_0_899
+PW_F106_MF217 times 2 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 2 dw F_1_451, -F_0_211
+PW_F362_MF127 times 2 dw F_3_624, -F_1_272
+PW_F085_MF072 times 2 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
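+
+; (The PD_DESCALE_* entries above are the 2^(n-1) rounding biases added
+; before an arithmetic right shift by n, and PB_CENTERJSAMP re-centers
+; the signed results around CENTERJSAMPLE for the unsigned sample
+; output.)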
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
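+ ; (Frame note: eax still holds the pre-alignment esp; it is stashed at
+ ; [ebp] so the code can later reach the stack arguments via
+ ; [original_ebp], while esp now sits below the aligned wk[] slots and
+ ; the JCOEF workspace.)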
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm0, mm1
+ packsswb mm0, mm0
+ movd eax, mm0
+ test eax, eax
+ jnz short .columnDCT
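+ ; (Only AC rows 1, 2, 3, 5, 6, and 7 are tested above: row 0 is the DC
+ ; row handled by the shortcut below, and row 4 does not contribute to
+ ; the 4x4 reduced output, per the original jidctred.c.)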
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw mm0, PASS1_BITS
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1
+ punpckhwd mm5, mm1
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3
+ punpckhwd mm7, mm3
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
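+ ; (The punpck*wd with a zeroed register puts each 16-bit coefficient in
+ ; the high word of a dword, so the single psrad above both sign-extends
+ ; it and scales it by 2^(CONST_BITS+1), i.e. the in0 << (CONST_BITS+1)
+ ; term of the scalar 4x4 algorithm.)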
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
+
+ paddd mm1, mm6
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P1_4
+ psrad mm2, DESCALE_P1_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 01 02 03)
+ packssdw mm5, mm3 ; mm5=data3=(30 31 32 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
+
+ paddd mm4, mm7
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P1_4
+ psrad mm0, DESCALE_P1_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm4, mm0 ; mm4=data1=(10 11 12 13)
+ packssdw mm2, mm3 ; mm2=data2=(20 21 22 23)
+
+ movq mm6, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm4 ; mm1=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm7, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm5 ; mm2=(20 30 21 31)
+ punpckhwd mm7, mm5 ; mm7=(22 32 23 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm2 ; mm1=(00 10 20 30)
+ punpckhdq mm0, mm2 ; mm0=(01 11 21 31)
+ movq mm3, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(02 12 22 32)
+ punpckhdq mm3, mm7 ; mm3=(03 13 23 33)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1
+ punpckhwd mm5, mm1
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3
+ punpckhwd mm7, mm3
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
+
+ paddd mm1, mm6
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P2_4
+ psrad mm2, DESCALE_P2_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 10 20 30)
+ packssdw mm5, mm3 ; mm5=data3=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
+
+ paddd mm4, mm7
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P2_4
+ psrad mm0, DESCALE_P2_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm4, mm0 ; mm4=data1=(01 11 21 31)
+ packssdw mm2, mm3 ; mm2=data2=(02 12 22 32)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32)
+ packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33)
+ paddb mm1, mm6
+ paddb mm4, mm6
+
+ movq mm7, mm1 ; transpose coefficients(phase 1)
+ punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13)
+ punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ psrlq mm1, 4*BYTE_BIT
+ psrlq mm0, 4*BYTE_BIT
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
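+ ;
+ ; (Only the rows and columns shown with coefficients contribute: the
+ ; 2-point even part uses just the DC term, the odd part combines
+ ; rows/columns 1, 3, 5, and 7, and A/B are the two rows produced by the
+ ; column pass, per the original jidctred.c.)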
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+ ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+ pcmpeqd mm7, mm7
+ pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+ movq mm4, mm0 ; mm4=(10 11 ** 13)
+ movq mm5, mm2 ; mm5=(50 51 ** 53)
+ punpcklwd mm4, mm1 ; mm4=(10 30 11 31)
+ punpcklwd mm5, mm3 ; mm5=(50 70 51 71)
+ pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
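+ ; (Each pmaddwd pairs two taps of the 4-tap odd filter: PW_F362_MF127
+ ; applies (3.624509785, -1.272758580) to the (row 1, row 3) words and
+ ; PW_F085_MF072 applies (0.850430095, -0.720959822) to the
+ ; (row 5, row 7) words; the paddd below sums the partial results into
+ ; tmp0.)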
+
+ psrld mm0, WORD_BIT ; mm0=(11 -- 13 --)
+ pand mm1, mm7 ; mm1=(-- 31 -- 33)
+ psrld mm2, WORD_BIT ; mm2=(51 -- 53 --)
+ pand mm3, mm7 ; mm3=(-- 71 -- 73)
+ por mm0, mm1 ; mm0=(11 31 13 33)
+ por mm2, mm3 ; mm2=(51 71 53 73)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm4, mm5 ; mm4=tmp0[col0 col1]
+
+ movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+ pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+ pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+ ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+ psrld mm6, WORD_BIT ; mm6=(15 -- 17 --)
+ pand mm1, mm7 ; mm1=(-- 35 -- 37)
+ psrld mm3, WORD_BIT ; mm3=(55 -- 57 --)
+ pand mm5, mm7 ; mm5=(-- 75 -- 77)
+ por mm6, mm1 ; mm6=(15 35 17 37)
+ por mm3, mm5 ; mm3=(55 75 57 77)
+ pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm0, mm2 ; mm0=tmp0[col1 col3]
+ paddd mm6, mm3 ; mm6=tmp0[col5 col7]
+
+ ; -- Even part
+
+ movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+ movq mm2, mm1 ; mm2=(00 01 ** 03)
+ pslld mm1, WORD_BIT ; mm1=(-- 00 -- **)
+ psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
+
+ pand mm2, mm7 ; mm2=(-- 01 -- 03)
+ pand mm5, mm7 ; mm5=(-- 05 -- 07)
+ psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
+ psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
+
+ ; -- Final output stage
+
+ movq mm3, mm1
+ paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **)
+ psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **)
+ punpckldq mm1, mm3 ; mm1=(A0 B0)
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
+
+ movq mm4, mm2
+ movq mm3, mm5
+ paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3)
+ paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7)
+ psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3)
+ psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7)
+
+ paddd mm1, mm7
+ psrad mm1, DESCALE_P1_2
+
+ paddd mm2, mm7
+ paddd mm5, mm7
+ psrad mm2, DESCALE_P1_2
+ psrad mm5, DESCALE_P1_2
+ paddd mm4, mm7
+ paddd mm3, mm7
+ psrad mm4, DESCALE_P1_2
+ psrad mm3, DESCALE_P1_2
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3)
+ packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm2, mm5 ; mm2=tmp0[row0 row1]
+
+ ; -- Even part
+
+ pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1]
+
+ ; -- Final output stage
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
+
+ movq mm6, mm1
+ paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1)
+ psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1)
+
+ paddd mm1, mm0
+ paddd mm6, mm0
+ psrad mm1, DESCALE_P2_2
+ psrad mm6, DESCALE_P2_2
+
+ movq mm7, mm1 ; transpose coefficients
+ punpckldq mm1, mm6 ; mm1=(C0 D0)
+ punpckhdq mm7, mm6 ; mm7=(C1 D1)
+
+ packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1)
+ packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+ paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ movd ecx, mm1
+ movd ebx, mm1 ; ebx=(C0 D0 C1 D1)
+ shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test eax, eax
+ jnz short .columnDCT
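+
+ ; (The two packsswb steps saturate the eight OR-ed words down into the
+ ; low dword, so the single 32-bit test above covers every column of the
+ ; OR-ed AC rows.)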
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ pi2fd mm4, mm4
+ pi2fd mm2, mm2
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ pi2fd mm5, mm5
+ pi2fd mm0, mm0
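+
+ ; (The punpck steps above move each centered sample byte into the top
+ ; byte of a dword, so psrad by DWORD_BIT-BYTE_BIT = 24 sign-extends it
+ ; to a full dword and pi2fd then converts each dword pair to
+ ; single-precision floats.)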
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ pi2fd mm6, mm6
+ pi2fd mm3, mm3
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ pi2fd mm4, mm4
+ pi2fd mm1, mm1
+
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .convloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
+ movd mm7, eax
+ punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F}
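+
+ ; (Standard float-to-int rounding trick: 12582912.0 = 1.5 * 2^23, so
+ ; adding it to a small float leaves the rounded integer in the low
+ ; mantissa bits; the word shuffles below then extract the low 16 bits
+ ; of each dword directly, avoiding pf2id.)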
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm0, mm7 ; mm0=(00 ** 01 **)
+ pfadd mm1, mm7 ; mm1=(02 ** 03 **)
+ pfadd mm2, mm7 ; mm2=(04 ** 05 **)
+ pfadd mm3, mm7 ; mm3=(06 ** 07 **)
+
+ movq mm4, mm0
+ punpcklwd mm0, mm1 ; mm0=(00 02 ** **)
+ punpckhwd mm4, mm1 ; mm4=(01 03 ** **)
+ movq mm5, mm2
+ punpcklwd mm2, mm3 ; mm2=(04 06 ** **)
+ punpckhwd mm5, mm3 ; mm5=(05 07 ** **)
+
+ punpcklwd mm0, mm4 ; mm0=(00 01 02 03)
+ punpcklwd mm2, mm5 ; mm2=(04 05 06 07)
+
+ movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm6, mm7 ; mm6=(10 ** 11 **)
+ pfadd mm1, mm7 ; mm1=(12 ** 13 **)
+ pfadd mm3, mm7 ; mm3=(14 ** 15 **)
+ pfadd mm4, mm7 ; mm4=(16 ** 17 **)
+
+ movq mm5, mm6
+ punpcklwd mm6, mm1 ; mm6=(10 12 ** **)
+ punpckhwd mm5, mm1 ; mm5=(11 13 ** **)
+ movq mm1, mm3
+ punpcklwd mm3, mm4 ; mm3=(14 16 ** **)
+ punpckhwd mm1, mm4 ; mm1=(15 17 ** **)
+
+ punpcklwd mm6, mm5 ; mm6=(10 11 12 13)
+ punpcklwd mm3, mm1 ; mm3=(14 15 16 17)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor mm6, mm6 ; mm6=(all 0's)
+ pcmpeqw mm7, mm7
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
+ movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
+
+ movq mm4, mm0
+ punpcklbw mm0, mm6 ; mm0=(0123)
+ punpckhbw mm4, mm6 ; mm4=(4567)
+ movq mm5, mm1
+ punpcklbw mm1, mm6 ; mm1=(89AB)
+ punpckhbw mm5, mm6 ; mm5=(CDEF)
+
+ paddw mm0, mm7
+ paddw mm4, mm7
+ paddw mm1, mm7
+ paddw mm5, mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+ movq mm0, mm2
+ punpcklbw mm2, mm6 ; mm2=(GHIJ)
+ punpckhbw mm0, mm6 ; mm0=(KLMN)
+ movq mm4, mm3
+ punpcklbw mm3, mm6 ; mm3=(OPQR)
+ punpckhbw mm4, mm6 ; mm4=(STUV)
+
+ paddw mm2, mm7
+ paddw mm0, mm7
+ paddw mm3, mm7
+ paddw mm4, mm7
+
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
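+; As a rough scalar sketch of the divisor tables defined below (names
+; follow the RECIPROCAL/CORRECTION/SCALE macros; the unsigned 16-bit
+; high-word multiplies are what the pmulhw sequences emulate here):
+;
+;   x = abs(workspace[i]);
+;   x = (UINT16)(((UINT32)(x + correction[i]) * reciprocal[i]) >> 16);
+;   x = (UINT16)(((UINT32)x * scale[i]) >> 16);
+;   coef_block[i] = (workspace[i] < 0) ? -x : x;
+;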
+
+%define RECIPROCAL(m, n, b) \
+ MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+ MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov ah, 2
+ alignx 16, 7
+.quantloop1:
+ mov al, DCTSIZE2/8/2
+ alignx 16, 7
+.quantloop2:
+ movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+ movq mm0, mm2
+ movq mm1, mm3
+
+ psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise
+ psraw mm3, (WORD_BIT-1)
+
+ pxor mm0, mm2 ; val = -val
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ ;
+ ; MMX is an annoyingly crappy instruction set. It has two
+ ; misfeatures that are causing problems here:
+ ;
+ ; - All multiplications are signed.
+ ;
+ ; - The second operand for the shifts is not treated as packed.
+ ;
+ ;
+ ; We work around the first problem by implementing this algorithm:
+ ;
+ ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+ ; {
+ ; enum { SHORT_BIT = 16 };
+ ; signed short sx = (signed short)x;
+ ; signed short sy = (signed short)y;
+ ; signed long sz;
+ ;
+ ; sz = (long)sx * (long)sy; /* signed multiply */
+ ;
+ ; if (sx < 0) sz += (long)sy << SHORT_BIT;
+ ; if (sy < 0) sz += (long)sx << SHORT_BIT;
+ ;
+ ; return (unsigned long)sz;
+ ; }
+ ;
+ ; (note that a negative sx adds _sy_ and vice versa)
+ ;
+ ; For the second problem, we replace the shift by a multiplication.
+ ; Unfortunately that means we have to deal with the signed issue again.
+ ;
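+ ; In the code below, the reciprocal always has its MSB set, so in
+ ; the model above "sy < 0" always holds and the correction term
+ ; reduces to adding the (already non-negative) input back after the
+ ; pmulhw -- which is exactly what the paddw following each pmulhw
+ ; does.
+ ;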
+
+ paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw mm1, MMWORD [CORRECTION(0,1,edx)]
+
+ movq mm4, mm0 ; store current value for later
+ movq mm5, mm1
+ pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
+ paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
+ paddw mm1, mm5 ; so we always need to add the initial value
+ ; (input value is never negative as we
+ ; inverted it at the start of this routine)
+
+ ; here it gets a bit tricky as both scale
+ ; and mm0/mm1 can be negative
+ movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
+ movq mm7, MMWORD [SCALE(0,1,edx)]
+ movq mm4, mm0
+ movq mm5, mm1
+ pmulhw mm0, mm6
+ pmulhw mm1, mm7
+
+ psraw mm6, (WORD_BIT-1) ; determine if scale is negative
+ psraw mm7, (WORD_BIT-1)
+
+ pand mm6, mm4 ; and add input if it is
+ pand mm7, mm5
+ paddw mm0, mm6
+ paddw mm1, mm7
+
+ psraw mm4, (WORD_BIT-1) ; then check if negative input
+ psraw mm5, (WORD_BIT-1)
+
+ pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
+ pand mm5, MMWORD [SCALE(0,1,edx)]
+ paddw mm0, mm4
+ paddw mm1, mm5
+
+ pxor mm0, mm2 ; val = -val
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+ add esi, byte 8*SIZEOF_DCTELEM
+ add edx, byte 8*SIZEOF_DCTELEM
+ add edi, byte 8*SIZEOF_JCOEF
+ dec al
+ jnz near .quantloop2
+ dec ah
+ jnz near .quantloop1 ; to avoid branch misprediction
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
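+; As a rough scalar sketch (illustrative; CENTERJSAMPLE == 128):
+;
+;   workspace[i] = (FAST_FLOAT)((int)sample_data[row][start_col + i] -
+;                               CENTERJSAMPLE);
+;
+; Here the centering happens in the byte domain (psubb with the 0x80
+; pattern) before the bytes are sign-extended and converted to floats.
+;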
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ cvtpi2ps xmm0, mm4 ; xmm0=(01**)
+ cvtpi2ps xmm1, mm2 ; xmm1=(23**)
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ cvtpi2ps xmm2, mm5 ; xmm2=(45**)
+ cvtpi2ps xmm3, mm0 ; xmm3=(67**)
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ cvtpi2ps xmm4, mm6 ; xmm4=(89**)
+ cvtpi2ps xmm5, mm3 ; xmm5=(AB**)
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ cvtpi2ps xmm6, mm4 ; xmm6=(CD**)
+ cvtpi2ps xmm7, mm1 ; xmm7=(EF**)
+
+ movlhps xmm0, xmm1 ; xmm0=(0123)
+ movlhps xmm2, xmm3 ; xmm2=(4567)
+ movlhps xmm4, xmm5 ; xmm4=(89AB)
+ movlhps xmm6, xmm7 ; xmm6=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
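+; As a rough scalar sketch (illustrative; each divisor is the
+; precomputed reciprocal of the scaled quantization step):
+;
+;   coef_block[i] = (JCOEF)lrintf(workspace[i] * divisors[i]);
+;
+; The cvtps2pi instructions below round according to the current MXCSR
+; rounding mode, which is round-to-nearest-even by default.
+;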
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ movhlps xmm4, xmm0
+ movhlps xmm5, xmm1
+
+ cvtps2pi mm0, xmm0
+ cvtps2pi mm1, xmm1
+ cvtps2pi mm4, xmm4
+ cvtps2pi mm5, xmm5
+
+ movhlps xmm6, xmm2
+ movhlps xmm7, xmm3
+
+ cvtps2pi mm2, xmm2
+ cvtps2pi mm3, xmm3
+ cvtps2pi mm6, xmm6
+ cvtps2pi mm7, xmm7
+
+ packssdw mm0, mm4
+ packssdw mm1, mm5
+ packssdw mm2, mm6
+ packssdw mm3, mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
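+; Same math as the SSE/MMX float path, but cvtps2dq converts four
+; floats per instruction (also rounding per MXCSR), and since no MMX
+; registers are touched, no emms is needed in this file.
+;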
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ vinserti128 ymm0, ymm0, xmm1, 1
+ vinserti128 ymm2, ymm2, xmm3, 1
+ vinserti128 ymm4, ymm4, xmm5, 1
+ vinserti128 ymm6, ymm6, xmm7, 1
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpunpcklbw ymm0, ymm0, ymm1
+ vpunpcklbw ymm2, ymm2, ymm1
+ vpunpcklbw ymm4, ymm4, ymm1
+ vpunpcklbw ymm6, ymm6, ymm1
+
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm4, ymm4, ymm7
+ vpaddw ymm6, ymm6, ymm7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
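+; Unlike the MMX/SSE2 versions, this routine is branchless and
+; loop-free: the whole 8x8 block fits in four ymm registers, vpabsw
+; takes absolute values, vpmulhuw performs the unsigned high-word
+; multiplies directly (no signed-multiply workaround), and vpsignw
+; restores the original signs (zeroing outputs whose inputs were zero)
+; at the end.
+;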
+
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+
+ vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
+
+ vpsignw ymm0, ymm0, ymm4
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
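+; The psraw/pxor/psubw triplets below are a branchless two's complement
+; absolute value; a rough scalar sketch:
+;
+;   mask = x >> 15;           /* arithmetic shift: -1 if x < 0, else 0 */
+;   abs  = (x ^ mask) - mask;
+;
+; The same masks are reused after the multiplies to restore the signs.
+;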
+
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/32
+ alignx 16, 7
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 32*SIZEOF_DCTELEM
+ add edx, byte 32*SIZEOF_DCTELEM
+ add edi, byte 32*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c
new file mode 100644
index 0000000000..80bc821ff4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimd.c
@@ -0,0 +1,1246 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
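+
+/*
+ * For example (illustrative), IS_ALIGNED(ptr, 4) masks the low four
+ * address bits ((1 << 4) - 1 == 0xF), so it is true only for 16-byte-
+ * aligned pointers; "order" is the log2 of the required alignment.
+ */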
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
+ simd_support &= JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
+ simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE | JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_AVX2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
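+
+/*
+ * Example (illustrative, not part of the library API): when diagnosing
+ * a bug in a specific code path, one can run e.g.
+ *
+ *   JSIMD_FORCESSE2=1 ./cjpeg -outfile out.jpg in.ppm
+ *
+ * which masks simd_support down to the SSE2 bit, so only the SSE2
+ * routines (or the plain C fallbacks) can be dispatched.
+ */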
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extrgb_ycc_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+ sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+ mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+ sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+ mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_ycc_convert_avx2;
+ sse2fct = jsimd_rgb_ycc_convert_sse2;
+ mmxfct = jsimd_rgb_ycc_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_gray_convert_avx2;
+ sse2fct = jsimd_extrgb_gray_convert_sse2;
+ mmxfct = jsimd_extrgb_gray_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_gray_convert_avx2;
+ sse2fct = jsimd_extrgbx_gray_convert_sse2;
+ mmxfct = jsimd_extrgbx_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_gray_convert_avx2;
+ sse2fct = jsimd_extbgr_gray_convert_sse2;
+ mmxfct = jsimd_extbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_gray_convert_avx2;
+ sse2fct = jsimd_extbgrx_gray_convert_sse2;
+ mmxfct = jsimd_extbgrx_gray_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_gray_convert_avx2;
+ sse2fct = jsimd_extxbgr_gray_convert_sse2;
+ mmxfct = jsimd_extxbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_gray_convert_avx2;
+ sse2fct = jsimd_extxrgb_gray_convert_sse2;
+ mmxfct = jsimd_extxrgb_gray_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_gray_convert_avx2;
+ sse2fct = jsimd_rgb_gray_convert_sse2;
+ mmxfct = jsimd_rgb_gray_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_ycc_extrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extrgb_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+ sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+ mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_ycc_extbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extbgr_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+ sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+ mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_ycc_rgb_convert_avx2;
+ sse2fct = jsimd_ycc_rgb_convert_sse2;
+ mmxfct = jsimd_ycc_rgb_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else
+ mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+ break;
+ default:
+ avx2fct = jsimd_h2v2_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_merged_upsample_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+ break;
+ default:
+ avx2fct = jsimd_h2v1_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_merged_upsample_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_SSE)
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_convsamp_avx2(sample_data, start_col, workspace);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_convsamp_sse2(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ if (simd_support & JSIMD_SSE2)
+ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+ else if (simd_support & JSIMD_SSE)
+ jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_fdct_islow_avx2(data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_fdct_islow_sse2(data);
+ else
+ jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ jsimd_fdct_ifast_sse2(data);
+ else
+ jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ jsimd_fdct_float_sse(data);
+ else if (simd_support & JSIMD_3DNOW)
+ jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_SSE)
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_quantize_avx2(coef_block, divisors, workspace);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_quantize_sse2(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ if (simd_support & JSIMD_SSE2)
+ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+ else if (simd_support & JSIMD_SSE)
+ jsimd_quantize_float_sse(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+ if (sizeof(FLOAT_MULT_TYPE) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ return 1;
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+ jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+ IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 4)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 4)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
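
The jsimd_can_*()/jsimd_*() pairs above follow one pattern throughout: the core
library calls the "can" function once during setup and, only if it returns
nonzero, routes that operation through the matching dispatcher, which picks the
best instruction set recorded in simd_support (AVX2, then SSE2, then MMX). A
minimal caller-side sketch of that pairing; select_fdct() is a hypothetical
name, and the real selection lives in libjpeg-turbo's jcdctmgr.c:

/* Assumes the usual libjpeg headers plus "jsimd.h" are included. */
typedef void (*fdct_method)(DCTELEM *data);

static fdct_method select_fdct(void)
{
  /* Probe once; CPU capabilities cannot change at run time. */
  if (jsimd_can_fdct_islow())
    return jsimd_fdct_islow;   /* dispatcher above: AVX2 -> SSE2 -> MMX */
  return jpeg_fdct_islow;      /* portable C implementation (jfdctint.c) */
}
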
diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+ push edi
+
+ xor edi, edi ; simd support flag
+
+ pushfd
+ pop eax
+ mov edx, eax
+ xor eax, 1<<21 ; flip ID bit in EFLAGS
+ push eax
+ popfd
+ pushfd
+ pop eax
+ xor eax, edx
+ jz near .return ; CPUID is not supported
+
+ ; Check whether CPUID leaf 07H is supported
+ ; (leaf 07H is used to check for AVX2 instruction support)
+ xor eax, eax
+ cpuid
+ test eax, eax
+ jz near .return
+ cmp eax, 7
+ jl short .no_avx2 ; Maximum leaf < 07H
+
+ ; Check for AVX2 instruction support
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ mov eax, ebx
+ test eax, 1<<5 ; bit5:AVX2
+ jz short .no_avx2
+
+ ; Check for AVX2 O/S support
+ mov eax, 1
+ xor ecx, ecx
+ cpuid
+ test ecx, 1<<27
+ jz short .no_avx2 ; O/S does not support XSAVE
+ test ecx, 1<<28
+ jz short .no_avx2 ; CPU does not support AVX
+
+ xor ecx, ecx
+ xgetbv
+ and eax, 6
+ cmp eax, 6 ; O/S does not manage XMM/YMM state
+ ; using XSAVE
+ jnz short .no_avx2
+
+ or edi, JSIMD_AVX2
+.no_avx2:
+
+ ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+ xor eax, eax
+ inc eax
+ cpuid
+ mov eax, edx ; eax = Standard feature flags
+
+ ; Check for MMX instruction support
+ test eax, 1<<23 ; bit23:MMX
+ jz short .no_mmx
+ or edi, byte JSIMD_MMX
+.no_mmx:
+ test eax, 1<<25 ; bit25:SSE
+ jz short .no_sse
+ or edi, byte JSIMD_SSE
+.no_sse:
+ test eax, 1<<26 ; bit26:SSE2
+ jz short .no_sse2
+ or edi, byte JSIMD_SSE2
+.no_sse2:
+
+ ; Check for 3DNow! instruction support
+ mov eax, 0x80000000
+ cpuid
+ cmp eax, 0x80000000
+ jbe short .return
+
+ mov eax, 0x80000001
+ cpuid
+ mov eax, edx ; eax = Extended feature flags
+
+ test eax, 1<<31 ; bit31:3DNow!(vendor independent)
+ jz short .no_3dnow
+ or edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+ mov eax, edi
+
+ pop edi
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
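
The routine above is the standard CPUID probe sequence: the EFLAGS ID-bit
toggle proves CPUID exists, leaf 01H EDX yields the MMX/SSE/SSE2 bits, AVX2
additionally requires leaf 07H EBX bit 5 plus OSXSAVE/AVX in leaf 01H ECX and
an XGETBV check that the OS saves XMM/YMM state, and extended leaf 80000001H
yields 3DNow!. A hedged C sketch of the same logic, assuming GCC/Clang on x86
with a <cpuid.h> that provides __get_cpuid_count; the MY_* flag values are
illustrative, not the library's JSIMD_* constants:

#include <cpuid.h>

#define MY_MMX   0x01  /* illustrative flag values only */
#define MY_SSE   0x02
#define MY_SSE2  0x04
#define MY_AVX2  0x08

static unsigned int cpu_simd_flags(void)
{
  unsigned int eax, ebx, ecx, edx, flags = 0;

  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;                              /* CPUID leaf 01H unavailable */
  if (edx & (1u << 23)) flags |= MY_MMX;   /* bit 23: MMX */
  if (edx & (1u << 25)) flags |= MY_SSE;   /* bit 25: SSE */
  if (edx & (1u << 26)) flags |= MY_SSE2;  /* bit 26: SSE2 */

  if ((ecx & (1u << 27)) && (ecx & (1u << 28))) {  /* OSXSAVE and AVX */
    unsigned int eax7, ebx7, ecx7, edx7, xlo, xhi;
    if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
        (ebx7 & (1u << 5))) {              /* leaf 07H EBX bit 5: AVX2 */
      __asm__("xgetbv" : "=a" (xlo), "=d" (xhi) : "c" (0));
      (void)xhi;
      if ((xlo & 6) == 6)                  /* OS saves XMM and YMM state */
        flags |= MY_AVX2;
    }
  }
  return flags;   /* the 3DNow! probe (leaf 80000001H) is omitted here */
}
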
diff --git a/media/libjpeg/simd/jccolext-mmx.asm b/media/libjpeg/simd/jccolext-mmx.asm
deleted file mode 100644
index 96a0372b1b..0000000000
--- a/media/libjpeg/simd/jccolext-mmx.asm
+++ /dev/null
@@ -1,476 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_ycc_convert_mmx)
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax,eax
- mov al, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx,edx
- mov dx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd mmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
- psllq mmA, DWORD_BIT
- por mmA,mmG
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- mov ecx, SIZEOF_MMWORD
- jmp short .rgb_ycc_cnv
-.column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmF,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
-
- movq mmD,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
-
- punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
-
- punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
-
- movq mmE,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
-
- punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
-
- punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
-
- pxor mmH,mmH
-
- movq mmC,mmA
- punpcklbw mmA,mmH ; mmA=(00 02 04 06)
- punpckhbw mmC,mmH ; mmC=(10 12 14 16)
-
- movq mmB,mmE
- punpcklbw mmE,mmH ; mmE=(20 22 24 26)
- punpckhbw mmB,mmH ; mmB=(01 03 05 07)
-
- movq mmF,mmD
- punpcklbw mmD,mmH ; mmD=(11 13 15 17)
- punpckhbw mmF,mmH ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF,mmA
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmD,mmA
- movq mmC,mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
-
- movq mmB,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
-
- movq mmG,mmD
- punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
-
- movq mmE,mmA
- punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
-
- movq mmH,mmB
- punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
-
- pxor mmF,mmF
-
- movq mmC,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 04 06)
- punpckhbw mmC,mmF ; mmC=(10 12 14 16)
-
- movq mmD,mmB
- punpcklbw mmB,mmF ; mmB=(01 03 05 07)
- punpckhbw mmD,mmF ; mmD=(11 13 15 17)
-
- movq mmG,mmE
- punpcklbw mmE,mmF ; mmE=(20 22 24 26)
- punpckhbw mmG,mmF ; mmG=(30 32 34 36)
-
- punpcklbw mmF,mmH
- punpckhbw mmH,mmH
- psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
- psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
- ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movq MMWORD [wk(0)], mm0 ; wk(0)=RE
- movq MMWORD [wk(1)], mm1 ; wk(1)=RO
- movq MMWORD [wk(2)], mm4 ; wk(2)=BE
- movq MMWORD [wk(3)], mm5 ; wk(3)=BO
-
- movq mm6,mm1
- punpcklwd mm1,mm3
- punpckhwd mm6,mm3
- movq mm7,mm1
- movq mm4,mm6
- pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor mm1,mm1
- pxor mm6,mm6
- punpcklwd mm1,mm5 ; mm1=BOL
- punpckhwd mm6,mm5 ; mm6=BOH
- psrld mm1,1 ; mm1=BOL*FIX(0.500)
- psrld mm6,1 ; mm6=BOH*FIX(0.500)
-
- movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm1
- paddd mm4,mm6
- paddd mm7,mm5
- paddd mm4,mm5
- psrld mm7,SCALEBITS ; mm7=CbOL
- psrld mm4,SCALEBITS ; mm4=CbOH
- packssdw mm7,mm4 ; mm7=CbO
-
- movq mm1, MMWORD [wk(2)] ; mm1=BE
-
- movq mm6,mm0
- punpcklwd mm0,mm2
- punpckhwd mm6,mm2
- movq mm5,mm0
- movq mm4,mm6
- pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor mm0,mm0
- pxor mm6,mm6
- punpcklwd mm0,mm1 ; mm0=BEL
- punpckhwd mm6,mm1 ; mm6=BEH
- psrld mm0,1 ; mm0=BEL*FIX(0.500)
- psrld mm6,1 ; mm6=BEH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm5,mm0
- paddd mm4,mm6
- paddd mm5,mm1
- paddd mm4,mm1
- psrld mm5,SCALEBITS ; mm5=CbEL
- psrld mm4,SCALEBITS ; mm4=CbEH
- packssdw mm5,mm4 ; mm5=CbE
-
- psllw mm7,BYTE_BIT
- por mm5,mm7 ; mm5=Cb
- movq MMWORD [ebx], mm5 ; Save Cb
-
- movq mm0, MMWORD [wk(3)] ; mm0=BO
- movq mm6, MMWORD [wk(2)] ; mm6=BE
- movq mm1, MMWORD [wk(1)] ; mm1=RO
-
- movq mm4,mm0
- punpcklwd mm0,mm3
- punpckhwd mm4,mm3
- movq mm7,mm0
- movq mm5,mm4
- pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
-
- paddd mm0, MMWORD [wk(4)]
- paddd mm4, MMWORD [wk(5)]
- paddd mm0,mm3
- paddd mm4,mm3
- psrld mm0,SCALEBITS ; mm0=YOL
- psrld mm4,SCALEBITS ; mm4=YOH
- packssdw mm0,mm4 ; mm0=YO
-
- pxor mm3,mm3
- pxor mm4,mm4
- punpcklwd mm3,mm1 ; mm3=ROL
- punpckhwd mm4,mm1 ; mm4=ROH
- psrld mm3,1 ; mm3=ROL*FIX(0.500)
- psrld mm4,1 ; mm4=ROH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm3
- paddd mm5,mm4
- paddd mm7,mm1
- paddd mm5,mm1
- psrld mm7,SCALEBITS ; mm7=CrOL
- psrld mm5,SCALEBITS ; mm5=CrOH
- packssdw mm7,mm5 ; mm7=CrO
-
- movq mm3, MMWORD [wk(0)] ; mm3=RE
-
- movq mm4,mm6
- punpcklwd mm6,mm2
- punpckhwd mm4,mm2
- movq mm1,mm6
- movq mm5,mm4
- pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
-
- paddd mm6, MMWORD [wk(6)]
- paddd mm4, MMWORD [wk(7)]
- paddd mm6,mm2
- paddd mm4,mm2
- psrld mm6,SCALEBITS ; mm6=YEL
- psrld mm4,SCALEBITS ; mm4=YEH
- packssdw mm6,mm4 ; mm6=YE
-
- psllw mm0,BYTE_BIT
- por mm6,mm0 ; mm6=Y
- movq MMWORD [edi], mm6 ; Save Y
-
- pxor mm2,mm2
- pxor mm4,mm4
- punpcklwd mm2,mm3 ; mm2=REL
- punpckhwd mm4,mm3 ; mm4=REH
- psrld mm2,1 ; mm2=REL*FIX(0.500)
- psrld mm4,1 ; mm4=REH*FIX(0.500)
-
- movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
- paddd mm1,mm2
- paddd mm5,mm4
- paddd mm1,mm0
- paddd mm5,mm0
- psrld mm1,SCALEBITS ; mm1=CrEL
- psrld mm5,SCALEBITS ; mm5=CrEH
- packssdw mm1,mm5 ; mm1=CrE
-
- psllw mm7,BYTE_BIT
- por mm1,mm7 ; mm1=Cr
- movq MMWORD [edx], mm1 ; Save Cr
-
- sub ecx, byte SIZEOF_MMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
- add edi, byte SIZEOF_MMWORD ; outptr0
- add ebx, byte SIZEOF_MMWORD ; outptr1
- add edx, byte SIZEOF_MMWORD ; outptr2
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
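
The "(Original)" vs. "(This implementation)" comment in the deleted code above
is the key to the whole kernel: pmaddwd multiplies signed 16-bit words, and
FIX(0.58700) = 38470 does not fit in one, so 0.587*G is computed as
0.337*G + 0.250*G (both coefficients fit); likewise the 0.5 coefficients in Cb
and Cr are produced with a one-bit shift rather than a multiply. A scalar
sketch of the fixed-point arithmetic being vectorized, assuming the usual
libjpeg SCALEBITS/FIX conventions; rgb_to_y() is an illustrative name:

#define SCALEBITS 16
#define ONE_HALF  (1L << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

static unsigned char rgb_to_y(int r, int g, int b)
{
  /* 0.587*G is split as (0.587 - 0.250)*G + 0.250*G so that every
   * coefficient fits in a signed 16-bit word for pmaddwd:
   * FIX(0.58700) = 38470 overflows, FIX(0.25000) = 16384 does not. */
  long y = FIX(0.29900) * r +
           (FIX(0.58700) - FIX(0.25000)) * g +
           FIX(0.11400) * b +
           FIX(0.25000) * g + ONE_HALF;
  return (unsigned char)(y >> SCALEBITS);
}
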
diff --git a/media/libjpeg/simd/jccolext-sse2-64.asm b/media/libjpeg/simd/jccolext-sse2-64.asm
deleted file mode 100644
index 8e4642d3bc..0000000000
--- a/media/libjpeg/simd/jccolext-sse2-64.asm
+++ /dev/null
@@ -1,486 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-
- align 16
-
- global EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rdx
- push rbx
- push rdi
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
- mov rbx, JSAMPROW [rbx] ; outptr1
- mov rdx, JSAMPROW [rdx] ; outptr2
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
-.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [rbx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [rdx], xmm1 ; Save Cr
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- add rbx, byte SIZEOF_XMMWORD ; outptr1
- add rdx, byte SIZEOF_XMMWORD ; outptr2
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
- pop rbx
- pop rdx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
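
The .column_ld1 through .column_ld32 ladder in the file above is worth calling
out: a residual pixel count smaller than one XMM register is handled with at
most four fixed-size loads (1, 2, 4, and 8 bytes) selected by the bits of the
byte count, with shifts packing the chunks into a single register, instead of a
scalar byte loop. A plain C stand-in for the idea, under the assumption that a
zero-padded 16-byte buffer is an acceptable substitute for the register;
load_tail() is an illustrative name:

#include <stdint.h>
#include <string.h>

static void load_tail(uint8_t dst[16], const uint8_t *src, unsigned n)
{
  unsigned rem = n;                       /* n < 16 residual bytes */
  memset(dst, 0, 16);
  /* Peel chunks from the end, one per set bit of n, largest last. */
  if (rem & 1) { rem -= 1; dst[rem] = src[rem]; }
  if (rem & 2) { rem -= 2; memcpy(dst + rem, src + rem, 2); }
  if (rem & 4) { rem -= 4; memcpy(dst + rem, src + rem, 4); }
  if (rem & 8) { rem -= 8; memcpy(dst + rem, src + rem, 8); }
}
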
diff --git a/media/libjpeg/simd/jccolext-sse2.asm b/media/libjpeg/simd/jccolext-sse2.asm
deleted file mode 100644
index cc38e98a18..0000000000
--- a/media/libjpeg/simd/jccolext-sse2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
-
- global EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [ebx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [edx], xmm1 ; Save Cr
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- add ebx, byte SIZEOF_XMMWORD ; outptr1
- add edx, byte SIZEOF_XMMWORD ; outptr2
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jccolor-altivec.c b/media/libjpeg/simd/jccolor-altivec.c
deleted file mode 100644
index ec473320e5..0000000000
--- a/media/libjpeg/simd/jccolor-altivec.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> YCC CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_081 5329 /* FIX(0.08131) */
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_168 11059 /* FIX(0.16874) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_331 21709 /* FIX(0.33126) */
-#define F_0_418 27439 /* FIX(0.41869) */
-#define F_0_500 32768 /* FIX(0.50000) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
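
The deleted wrapper above is a compact example of the template-instantiation
idiom used throughout the SIMD layer: one implementation file (here
jccolext-altivec.c) is #included once per pixel format, with RGB_PIXELSIZE,
the RGBG_INDEX* shuffle tables, and the public symbol name redefined each
time; the NASM wrappers that follow (jccolor-mmx.asm and friends) do the same
with %define/%include. A minimal sketch of the idiom with hypothetical file
and macro names:

/* convert_tmpl.c -- hypothetical template; compiled only via #include */
void CONVERT_FN(const unsigned char *in, unsigned char *out, int npix)
{
  for (int i = 0; i < npix; i++)
    out[i] = in[i * RGB_PIXELSIZE];      /* stand-in conversion body */
}

/* includer: stamps out one converter per pixel format */
#define RGB_PIXELSIZE 3
#define CONVERT_FN convert_rgb
#include "convert_tmpl.c"
#undef CONVERT_FN
#undef RGB_PIXELSIZE

#define RGB_PIXELSIZE 4
#define CONVERT_FN convert_rgbx
#include "convert_tmpl.c"
#undef CONVERT_FN
#undef RGB_PIXELSIZE
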
diff --git a/media/libjpeg/simd/jccolor-mmx.asm b/media/libjpeg/simd/jccolor-mmx.asm
deleted file mode 100644
index c4e6d88be3..0000000000
--- a/media/libjpeg/simd/jccolor-mmx.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_mmx)
-
-EXTN(jconst_rgb_ycc_convert_mmx):
-
-PW_F0299_F0337 times 2 dw F_0_299, F_0_337
-PW_F0114_F0250 times 2 dw F_0_114, F_0_250
-PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
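The constant pool above packs the fixed-point coefficients two to a doubleword (PW_F0299_F0337, PW_F0114_F0250, ...) because pmaddwd multiplies adjacent signed 16-bit words and sums the products, so one instruction per lane yields 0.299*R + 0.337*G. That also appears to be why Y is decomposed as 0.299R + 0.337G + 0.114B + 0.250G: FIX(0.587) = 38470 does not fit in a signed 16-bit word. PD_ONEHALFM1_CJ folds the Cb/Cr rounding term and the CENTERJSAMPLE bias into one addend. A small check of the derivations, assuming the usual libjpeg-style FIX() macro:

    #include <assert.h>
    #include <stdint.h>

    #define SCALEBITS     16
    #define CENTERJSAMPLE 128
    #define FIX(x)  ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

    int main(void)
    {
      assert(FIX(0.29900) == 19595);                 /* F_0_299 */
      assert(FIX(0.58700) - FIX(0.25000) == 22086);  /* F_0_337 */
      assert(FIX(0.58700) == 38470);   /* > 32767: unusable as a pmaddwd word */
      assert(FIX(0.11400) == 7471);                  /* F_0_114 */
      /* PD_ONEHALFM1_CJ = ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS) */
      assert((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
             == 8421375);
      return 0;
    }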
diff --git a/media/libjpeg/simd/jccolor-sse2-64.asm b/media/libjpeg/simd/jccolor-sse2-64.asm
deleted file mode 100644
index bd2188b4c0..0000000000
--- a/media/libjpeg/simd/jccolor-sse2-64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jccolor-sse2.asm b/media/libjpeg/simd/jccolor-sse2.asm
deleted file mode 100644
index 13124d13d7..0000000000
--- a/media/libjpeg/simd/jccolor-sse2.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgray-altivec.c b/media/libjpeg/simd/jcgray-altivec.c
deleted file mode 100644
index 684df5ef1e..0000000000
--- a/media/libjpeg/simd/jcgray-altivec.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> GRAYSCALE CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgray-mmx.asm b/media/libjpeg/simd/jcgray-mmx.asm
deleted file mode 100644
index 0819b6ca01..0000000000
--- a/media/libjpeg/simd/jcgray-mmx.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_mmx)
-
-EXTN(jconst_rgb_gray_convert_mmx):
-
-PW_F0299_F0337 times 2 dw F_0_299, F_0_337
-PW_F0114_F0250 times 2 dw F_0_114, F_0_250
-PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2-64.asm b/media/libjpeg/simd/jcgray-sse2-64.asm
deleted file mode 100644
index bafd302aa5..0000000000
--- a/media/libjpeg/simd/jcgray-sse2-64.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2.asm b/media/libjpeg/simd/jcgray-sse2.asm
deleted file mode 100644
index 5b0b466953..0000000000
--- a/media/libjpeg/simd/jcgray-sse2.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgryext-mmx.asm b/media/libjpeg/simd/jcgryext-mmx.asm
deleted file mode 100644
index 1c1b8d8bc4..0000000000
--- a/media/libjpeg/simd/jcgryext-mmx.asm
+++ /dev/null
@@ -1,356 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_gray_convert_mmx)
-
-EXTN(jsimd_rgb_gray_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax,eax
- mov al, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx,edx
- mov dx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd mmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
- psllq mmA, DWORD_BIT
- por mmA,mmG
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- mov ecx, SIZEOF_MMWORD
- jmp short .rgb_gray_cnv
-.column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_gray_cnv
- movq mmF,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
- ; mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
-
- movq mmD,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
-
- punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
-
- punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
-
- movq mmE,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
-
- punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
-
- punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
-
- pxor mmH,mmH
-
- movq mmC,mmA
- punpcklbw mmA,mmH ; mmA=(00 02 04 06)
- punpckhbw mmC,mmH ; mmC=(10 12 14 16)
-
- movq mmB,mmE
- punpcklbw mmE,mmH ; mmE=(20 22 24 26)
- punpckhbw mmB,mmH ; mmB=(01 03 05 07)
-
- movq mmF,mmD
- punpcklbw mmD,mmH ; mmD=(11 13 15 17)
- punpckhbw mmF,mmH ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF,mmA
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_gray_cnv
- movq mmD,mmA
- movq mmC,mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
- ; mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
-
- movq mmB,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
-
- movq mmG,mmD
- punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
-
- movq mmE,mmA
- punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
-
- movq mmH,mmB
- punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
-
- pxor mmF,mmF
-
- movq mmC,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 04 06)
- punpckhbw mmC,mmF ; mmC=(10 12 14 16)
-
- movq mmD,mmB
- punpcklbw mmB,mmF ; mmB=(01 03 05 07)
- punpckhbw mmD,mmF ; mmD=(11 13 15 17)
-
- movq mmG,mmE
- punpcklbw mmE,mmF ; mmE=(20 22 24 26)
- punpckhbw mmG,mmF ; mmG=(30 32 34 36)
-
- punpcklbw mmF,mmH
- punpckhbw mmH,mmH
- psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
- psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
- ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movq mm6,mm1
- punpcklwd mm1,mm3
- punpckhwd mm6,mm3
- pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movq mm6,mm0
- punpcklwd mm0,mm2
- punpckhwd mm6,mm2
- pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movq mm0, mm5 ; mm0=BO
- movq mm6, mm4 ; mm6=BE
-
- movq mm4,mm0
- punpcklwd mm0,mm3
- punpckhwd mm4,mm3
- pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
-
- paddd mm0, mm1
- paddd mm4, mm7
- paddd mm0,mm3
- paddd mm4,mm3
- psrld mm0,SCALEBITS ; mm0=YOL
- psrld mm4,SCALEBITS ; mm4=YOH
- packssdw mm0,mm4 ; mm0=YO
-
- movq mm4,mm6
- punpcklwd mm6,mm2
- punpckhwd mm4,mm2
- pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
-
- paddd mm6, MMWORD [wk(0)]
- paddd mm4, MMWORD [wk(1)]
- paddd mm6,mm2
- paddd mm4,mm2
- psrld mm6,SCALEBITS ; mm6=YEL
- psrld mm4,SCALEBITS ; mm4=YEH
- packssdw mm6,mm4 ; mm6=YE
-
- psllw mm0,BYTE_BIT
- por mm6,mm0 ; mm6=Y
- movq MMWORD [edi], mm6 ; Save Y
-
- sub ecx, byte SIZEOF_MMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
- add edi, byte SIZEOF_MMWORD ; outptr0
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
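The .column_ld1/.column_ld2/... ladder in the routine above handles a row remainder narrower than one MMWORD: it tests the leftover byte count one power of two at a time (test cl, SIZEOF_BYTE/WORD/DWORD/...) and merges each chunk into the register with shifts, so no load ever runs past the end of the row. The net effect, modeled in plain C (illustrative helper, not the deleted code):

    #include <string.h>
    #include <stdint.h>

    /* Copy an nbytes tail (nbytes < 16) using only power-of-two chunks,
       left-packed at dst[0].  The asm performs the same decomposition
       smallest chunk first, shifting earlier chunks up as each larger
       one is OR'd in underneath. */
    static void load_tail(uint8_t dst[16], const uint8_t *src,
                          unsigned nbytes)
    {
      unsigned off = 0;
      for (unsigned chunk = 8; chunk != 0; chunk >>= 1) {
        if (nbytes & chunk) {        /* mirrors `test cl, SIZEOF_*` */
          memcpy(dst + off, src + off, chunk);
          off += chunk;
        }
      }
    }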
diff --git a/media/libjpeg/simd/jcgryext-sse2-64.asm b/media/libjpeg/simd/jcgryext-sse2-64.asm
deleted file mode 100644
index 541355af86..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2-64.asm
+++ /dev/null
@@ -1,365 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
-
- global EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rdi
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
-.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcgryext-sse2.asm b/media/libjpeg/simd/jcgryext-sse2.asm
deleted file mode 100644
index cd16dd1928..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2.asm
+++ /dev/null
@@ -1,384 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
-
- global EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jchuff-sse2-64.asm b/media/libjpeg/simd/jchuff-sse2-64.asm
deleted file mode 100644
index b1144d1cdd..0000000000
--- a/media/libjpeg/simd/jchuff-sse2-64.asm
+++ /dev/null
@@ -1,360 +0,0 @@
-;
-; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
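
A rough C restatement of this strategy (hypothetical names, not the exact jchuff.c code): bits accumulate in a 64-bit buffer, and whole bytes are flushed only once more than 47 bits are pending, stuffing a zero byte after any 0xFF as the JPEG spec requires.

    #include <stdint.h>

    typedef struct { uint64_t buf; int bits; } bitstate;

    static uint8_t *checkbuf47(bitstate *s, uint8_t *out) {
      while (s->bits > 47) {            /* flush only when nearly full */
        s->bits -= 8;
        uint8_t c = (uint8_t)(s->buf >> s->bits);
        *out++ = c;                     /* EMIT_BYTE */
        if (c == 0xFF) *out++ = 0;      /* byte-stuff after 0xFF */
      }
      return out;
    }

    static uint8_t *emit_bits(bitstate *s, uint8_t *out, unsigned code, int size) {
      out = checkbuf47(s, out);         /* CHECKBUF47, then PUT_BITS */
      s->bits += size;
      s->buf = (s->buf << size) | code;
      return out;
    }
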
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov rdx, put_buffer
- mov ecx, put_bits
- shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [buffer], dl ; *buffer++ = c;
- add buffer, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [buffer], 0 ; *buffer++ = 0;
- add buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
-%endmacro
-
-%macro CHECKBUF31 0
- cmp put_bits, 32 ; if (put_bits > 31) {
- jl %%.CHECKBUF31_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
- cmp put_bits, 48 ; if (put_bits > 47) {
- jl %%.CHECKBUF47_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
-
-%macro EMIT_BITS 2
- CHECKBUF47
- mov ecx, %2
- PUT_BITS %1
-%endmacro
-
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
- pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
- pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
- pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
-%else
- pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
-%endif
- pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
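
In scalar terms, each kloop_prepare invocation gathers 32 coefficients in zigzag order (the jno arguments) and stores their magnitudes to t1 and the to-be-emitted bit patterns to t2. A rough C equivalent (ignoring the dummy final index used to pad the second call):

    #include <stdint.h>

    /* Sketch: ko is 0 or 32; jno[] holds the 32 zigzag indices. */
    static void kloop_prepare(const int16_t *block, const int *jno, int ko,
                              int16_t *t1, int16_t *t2) {
      for (int k = 0; k < 32; k++) {
        int16_t x = block[jno[k]];
        int16_t neg = x < 0 ? -1 : 0;           /* pcmpgtw against zero */
        int16_t a = (int16_t)((x + neg) ^ neg); /* branchless abs */
        t1[ko + k] = a;                         /* magnitude */
        t2[ko + k] = (int16_t)(a ^ neg);        /* a if x >= 0, ~a if x < 0 */
      }
    }
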
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-; JCOEFPTR block, int last_dc_val,
-; c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13 = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
-%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
-%define put_buffer r8
-%define put_bits r9d
-%define buffer rax
-
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [t2]
- collect_args
-%ifdef WIN64
- movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
- movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
- movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
- movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
- sub rsp, 4*SIZEOF_XMMWORD
-%endif
- push rbx
-
- mov buffer, r11 ; r11 is now scratch
-
- mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
- push r10 ; r10 is now scratch
-
- ; Encode the DC coefficient difference per section F.1.2.1
- movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
- sub edi, r13d ; r13 is not used anymore
- mov ebx, edi
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov esi, edi
- sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor edi, esi ; temp ^= temp3;
- sub edi, esi ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add ebx, esi ; temp2 += temp3;
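
The same computation in C, for reference (this is exactly what the comments above describe):

    /* temp = block[0] - last_dc_val on entry. */
    static void dc_prepare(int temp, int *magnitude, int *emit_value) {
      int temp2 = temp;
      int temp3 = temp >> 31;          /* 0 if temp >= 0, -1 if temp < 0 */
      temp = (temp ^ temp3) - temp3;   /* abs(temp), no branch */
      temp2 += temp3;                  /* temp < 0: temp - 1, whose low bits are
                                          ~abs(temp) in two's complement */
      *magnitude = temp;
      *emit_value = temp2;
    }
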
-
- ; Find the number of bits needed for the magnitude of the coefficient
- lea r11, [rel jpeg_nbits_table]
- movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
- ; Emit the Huffman-coded symbol for the number of bits
- mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
- movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS r11, esi ; EMIT_BITS(code, size)
-
- ; Mask off any extra bits in code
- mov esi, 1
- mov ecx, edi
- shl esi, cl
- dec esi
- and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ebx, ebx
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm4, xmm5, xmm6, xmm7
-
- pxor xmm8, xmm8
- pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
- pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
- pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
- pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
- packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
- pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
- pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
- shl r12, 16
- shl r14, 16
- or r11, r12
- or r13, r14
- shl r13, 32
- or r11, r13
- not r11 ; index = ~index;
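
The pmovmskb results above are merged into a single 64-bit mask with one bit per coefficient (set where t1[k] != 0), so the AC loop below can skip zero runs with a count-trailing-zeros instead of testing all 63 positions. A skeleton of that control flow (zeromask/t1/t2 stand for the values built above):

    /* Skeleton of the mask-driven AC loop; the emit steps are elided. */
    uint64_t index = ~zeromask;            /* bit k set => t1[k] != 0 */
    int k = 0;
    while (index) {
      int r = __builtin_ctzll(index);      /* bsf: length of the zero run */
      k += r;
      index >>= r;
      while (r > 15)                       /* runs of 16+ use the 0xF0 symbol */
        r -= 16;                           /* (emit code_0xf0 here) */
      /* emit actbl symbol (r << 4) + nbits, then nbits bits of t2[k] */
      index >>= 1;
      k++;
    }
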
-
- ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
- ;jmp .EFN
-
- mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- lea rsi, [t1]
-.BLOOP:
- bsf r12, r11 ; r = __builtin_ctzl(index);
- jz .ELOOP
- mov rcx, r12
- lea rsi, [rsi+r12*2] ; k += r;
- shr r11, cl ; index >>= r;
- movzx rdi, word [rsi] ; temp = t1[k];
- lea rbx, [rel jpeg_nbits_table]
- movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
- cmp r12, 16 ; while (r > 15) {
- jl .ERLOOP
- EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
- sub r12, 16 ; r -= 16;
- jmp .BRLOOP
-.ERLOOP:
- ; Emit Huffman symbol for run length / number of bits
- CHECKBUF31 ; uses rcx, rdx
-
- shl r12, 4 ; temp3 = (r << 4) + nbits;
- add r12, rdi
- mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
- PUT_BITS rbx
-
- ;EMIT_CODE(code, size)
-
- movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov rcx, rdi
- mov rdx, 1
- shl rdx, cl
- dec rdx
- and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- PUT_BITS rbx ; PUT_BITS(temp2, nbits)
-
- shr r11, 1 ; index >>= 1;
- add rsi, 2 ; ++k;
- jmp .BLOOP
-.ELOOP:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp rdi, rsi ; if (r > 0) {
- je .EFN
- mov ebx, INT [r15] ; code = actbl->ehufco[0];
- movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS rbx, r12d
-.EFN:
- pop r10
- ; Save put_buffer & put_bits
- mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
-
- pop rbx
-%ifdef WIN64
- movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
- movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
- movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
- movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
- add rsp, 4*SIZEOF_XMMWORD
-%endif
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jchuff-sse2.asm b/media/libjpeg/simd/jchuff-sse2.asm
deleted file mode 100644
index 36d1f2db66..0000000000
--- a/media/libjpeg/simd/jchuff-sse2.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-;
-; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov edx, put_buffer
- mov ecx, put_bits
- shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [eax], dl ; *buffer++ = c;
- add eax, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [eax], 0 ; *buffer++ = 0;
- add eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
-%endmacro
-
-%macro CHECKBUF15 0
- cmp put_bits, 16 ; if (put_bits > 15) {
- jl %%.CHECKBUF15_END
- mov eax, POINTER [esp+buffer]
- EMIT_BYTE
- EMIT_BYTE
- mov POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
- PUT_BITS %1
- CHECKBUF15
-%endmacro
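
Note the reversed order relative to the 64-bit variant: PUT_BITS first, then CHECKBUF15. Since put_buffer is a 32-bit register here and a single PUT_BITS adds at most 16 bits, keeping no more than 15 bits pending after each emit guarantees the buffer never overflows (15 + 16 = 31 bits worst case).
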
-
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
- pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
- pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
- pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
-%else
- pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
-%endif
- pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-; JCOEFPTR block, int last_dc_val,
-; c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
-%define t1 pad
-%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
-%define block t2+(DCTSIZE2*SIZEOF_WORD)
-%define actbl block+SIZEOF_DWORD
-%define buffer actbl+SIZEOF_DWORD
-%define temp buffer+SIZEOF_DWORD
-%define temp2 temp+SIZEOF_DWORD
-%define temp3 temp2+SIZEOF_DWORD
-%define temp4 temp3+SIZEOF_DWORD
-%define temp5 temp4+SIZEOF_DWORD
-%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
-%define put_buffer ebx
-%define put_bits edi
-
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- sub esp, temp5+9*SIZEOF_DWORD-pad
- push ebx
- push ecx
-; push edx ; need not be preserved
- push esi
- push edi
- push ebp
-
- mov esi, POINTER [eax+8] ; (working_state *state)
- mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
- push esi ; esi is now scratch
-
- get_GOT edx ; get GOT address
- movpic POINTER [esp+gotptr], edx ; save GOT address
-
- mov ecx, POINTER [eax+28]
- mov edx, POINTER [eax+16]
- mov esi, POINTER [eax+12]
- mov POINTER [esp+actbl], ecx
- mov POINTER [esp+block], edx
- mov POINTER [esp+buffer], esi
-
- ; Encode the DC coefficient difference per section F.1.2.1
- mov esi, POINTER [esp+block] ; block
- movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
- sub ecx, DWORD [eax+20]
- mov esi, ecx
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov edx, ecx
- sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor ecx, edx ; temp ^= temp3;
- sub ecx, edx ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add esi, edx ; temp2 += temp3;
- mov DWORD [esp+temp], esi ; backup temp2 in temp
-
- ; Find the number of bits needed for the magnitude of the coefficient
- movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
- movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], edx ; backup nbits in temp2
-
- ; Emit the Huffman-coded symbol for the number of bits
- mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
- mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
- movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS eax ; EMIT_BITS(code, size)
-
- mov ecx, DWORD [esp+temp2] ; restore nbits
-
- ; Mask off any extra bits in code
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ecx, ecx
- mov esi, POINTER [esp+block]
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm0, xmm1, xmm2, xmm3
-
- pxor xmm7, xmm7
- movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea esi, [esp+t1]
- mov ebp, POINTER [esp+actbl] ; ebp = actbl
-
-.BLOOP:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
-.BRLOOP:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP
-.ERLOOP:
- movsx eax, word [esi] ; temp = t1[k];
- movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
- movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP
-.ELOOP:
- movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
- movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
- movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
- movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
- sub eax, esi
- shr eax, 1
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- shr edx, cl ; index >>= r;
- add ecx, eax
- lea esi, [esi+ecx*2] ; k += r;
- mov DWORD [esp+temp3], edx
- jmp .BRLOOP2
-.BLOOP2:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
-.BRLOOP2:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP2
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP2
-.ERLOOP2:
- movsx eax, word [esi] ; temp = t1[k];
- bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
- inc eax
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP2
-.ELOOP2:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp edx, esi ; if (r > 0) {
- je .EFN
- mov eax, INT [ebp] ; code = actbl->ehufco[0];
- movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS eax
-.EFN:
- mov eax, [esp+buffer]
- pop esi
- ; Save put_buffer & put_bits
- mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
-
- pop ebp
- pop edi
- pop esi
-; pop edx ; need not be preserved
- pop ecx
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcolsamp.inc b/media/libjpeg/simd/jcolsamp.inc
deleted file mode 100644
index 3be446e847..0000000000
--- a/media/libjpeg/simd/jcolsamp.inc
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jcolsamp.inc - private declarations for color conversion & up/downsampling
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
-
-; --------------------------------------------------------------------------
-
-; pseudo-registers to make ordering of RGB configurable
-;
-%if RGB_RED == 0
-%define mmA mm0
-%define mmB mm1
-%define xmmA xmm0
-%define xmmB xmm1
-%elif RGB_GREEN == 0
-%define mmA mm2
-%define mmB mm3
-%define xmmA xmm2
-%define xmmB xmm3
-%elif RGB_BLUE == 0
-%define mmA mm4
-%define mmB mm5
-%define xmmA xmm4
-%define xmmB xmm5
-%else
-%define mmA mm6
-%define mmB mm7
-%define xmmA xmm6
-%define xmmB xmm7
-%endif
-
-%if RGB_RED == 1
-%define mmC mm0
-%define mmD mm1
-%define xmmC xmm0
-%define xmmD xmm1
-%elif RGB_GREEN == 1
-%define mmC mm2
-%define mmD mm3
-%define xmmC xmm2
-%define xmmD xmm3
-%elif RGB_BLUE == 1
-%define mmC mm4
-%define mmD mm5
-%define xmmC xmm4
-%define xmmD xmm5
-%else
-%define mmC mm6
-%define mmD mm7
-%define xmmC xmm6
-%define xmmD xmm7
-%endif
-
-%if RGB_RED == 2
-%define mmE mm0
-%define mmF mm1
-%define xmmE xmm0
-%define xmmF xmm1
-%elif RGB_GREEN == 2
-%define mmE mm2
-%define mmF mm3
-%define xmmE xmm2
-%define xmmF xmm3
-%elif RGB_BLUE == 2
-%define mmE mm4
-%define mmF mm5
-%define xmmE xmm4
-%define xmmF xmm5
-%else
-%define mmE mm6
-%define mmF mm7
-%define xmmE xmm6
-%define xmmF xmm7
-%endif
-
-%if RGB_RED == 3
-%define mmG mm0
-%define mmH mm1
-%define xmmG xmm0
-%define xmmH xmm1
-%elif RGB_GREEN == 3
-%define mmG mm2
-%define mmH mm3
-%define xmmG xmm2
-%define xmmH xmm3
-%elif RGB_BLUE == 3
-%define mmG mm4
-%define mmH mm5
-%define xmmG xmm4
-%define xmmH xmm5
-%else
-%define mmG mm6
-%define mmH mm7
-%define xmmG xmm6
-%define xmmH xmm7
-%endif
-
-; --------------------------------------------------------------------------
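
For instance, with the plain RGB layout (RGB_RED == 0, RGB_GREEN == 1, RGB_BLUE == 2) these resolve to xmmA/xmmB = xmm0/xmm1 for red, xmmC/xmmD = xmm2/xmm3 for green, xmmE/xmmF = xmm4/xmm5 for blue, and xmmG/xmmH = xmm6/xmm7 for the unused fourth byte position; with BGR the A and E pairs swap. The conversion kernels can thus be written once against the pseudo-registers and assembled separately for each pixel order.
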
diff --git a/media/libjpeg/simd/jcsample-mmx.asm b/media/libjpeg/simd/jcsample-mmx.asm
deleted file mode 100644
index 6cd544e74d..0000000000
--- a/media/libjpeg/simd/jcsample-mmx.asm
+++ /dev/null
@@ -1,323 +0,0 @@
-;
-; jcsample.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
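
In scalar terms this pass averages horizontal sample pairs with a 0/1 bias that alternates per output column (the {0, 1, 0, 1} pattern loaded below), so rounding does not drift in one direction. A rough C equivalent:

    typedef unsigned char JSAMPLE;
    static void h2v1_downsample_row(const JSAMPLE *inptr, JSAMPLE *outptr,
                                    unsigned output_cols) {
      for (unsigned col = 0; col < output_cols; col++)   /* bias = col & 1 */
        outptr[col] = (JSAMPLE)((inptr[2*col] + inptr[2*col+1] + (col & 1)) >> 1);
    }
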
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_mmx)
-
-EXTN(jsimd_h2v1_downsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
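
The expand_right_edge step above (the rep stosb loop) pads every input row out to output_cols * 2 samples by replicating the last real column, so the column loop can always consume full MMWORDs. In C, roughly:

    typedef unsigned char JSAMPLE;
    static void expand_right_edge(JSAMPLE **rows, int num_rows,
                                  unsigned image_width, unsigned padded_width) {
      for (int r = 0; r < num_rows; r++) {
        JSAMPLE last = rows[r][image_width - 1];
        for (unsigned c = image_width; c < padded_width; c++)
          rows[r][c] = last;                 /* replicate the edge sample */
      }
    }
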
-
- ; -- h2v1_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00010000 ; bias pattern
- movd mm7,edx
- pcmpeqw mm6,mm6
- punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mm2,mm0
- movq mm3,mm1
-
- pand mm0,mm6
- psrlw mm2,BYTE_BIT
- pand mm1,mm6
- psrlw mm3,BYTE_BIT
-
- paddw mm0,mm2
- paddw mm1,mm3
- paddw mm0,mm7
- paddw mm1,mm7
- psrlw mm0,1
- psrlw mm1,1
-
- packuswb mm0,mm1
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add edi, byte 1*SIZEOF_MMWORD ; outptr
- sub ecx, byte SIZEOF_MMWORD ; outcol
- jnz short .columnloop
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
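
Scalar equivalent: each output sample is the average of a 2x2 input block, with the bias alternating 1, 2 (the {1, 2, 1, 2} pattern below) to balance rounding. Roughly:

    typedef unsigned char JSAMPLE;
    static void h2v2_downsample_row(const JSAMPLE *inptr0, const JSAMPLE *inptr1,
                                    JSAMPLE *outptr, unsigned output_cols) {
      for (unsigned col = 0; col < output_cols; col++) {
        int sum = inptr0[2*col] + inptr0[2*col+1] +
                  inptr1[2*col] + inptr1[2*col+1];
        outptr[col] = (JSAMPLE)((sum + 1 + (col & 1)) >> 2);  /* bias 1 or 2 */
      }
    }
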
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_mmx)
-
-EXTN(jsimd_h2v2_downsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00020001 ; bias pattern
- movd mm7,edx
- pcmpeqw mm6,mm6
- punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
- mov edi, JSAMPROW [edi] ; outptr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
- movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
- movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm4,mm0
- movq mm5,mm1
- pand mm0,mm6
- psrlw mm4,BYTE_BIT
- pand mm1,mm6
- psrlw mm5,BYTE_BIT
- paddw mm0,mm4
- paddw mm1,mm5
-
- movq mm4,mm2
- movq mm5,mm3
- pand mm2,mm6
- psrlw mm4,BYTE_BIT
- pand mm3,mm6
- psrlw mm5,BYTE_BIT
- paddw mm2,mm4
- paddw mm3,mm5
-
- paddw mm0,mm1
- paddw mm2,mm3
- paddw mm0,mm7
- paddw mm2,mm7
- psrlw mm0,2
- psrlw mm2,2
-
- packuswb mm0,mm2
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
- add edx, byte 2*SIZEOF_MMWORD ; inptr0
- add esi, byte 2*SIZEOF_MMWORD ; inptr1
- add edi, byte 1*SIZEOF_MMWORD ; outptr
- sub ecx, byte SIZEOF_MMWORD ; outcol
- jnz near .columnloop
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte 2*SIZEOF_JSAMPROW ; input_data
- add edi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcsample-sse2-64.asm b/media/libjpeg/simd/jcsample-sse2-64.asm
deleted file mode 100644
index 40ee15fcbb..0000000000
--- a/media/libjpeg/simd/jcsample-sse2-64.asm
+++ /dev/null
@@ -1,329 +0,0 @@
-;
-; jcsample.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
-
- mov edx, r10d
-
- ; -- expand_right_edge
-
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
-
- mov rax, r11
- test rax,rax
- jle short .expand_end
-
- cld
- mov rsi, r14 ; input_data
-.expandloop:
- push rax
- push rcx
-
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
-
- rep stosb
-
- pop rcx
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
-
-.expand_end:
- pop rcx ; output_cols
-
- ; -- h2v1_downsample
-
- mov eax, r12d ; rowctr
- test eax,eax
- jle near .return
-
- mov rdx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
-.rowloop:
- push rcx
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
-
-.columnloop:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test rcx,rcx
- jnz short .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
-
- mov edx, r10d
-
- ; -- expand_right_edge
-
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
-
- mov rax, r11
- test rax,rax
- jle short .expand_end
-
- cld
- mov rsi, r14 ; input_data
-.expandloop:
- push rax
- push rcx
-
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
-
- rep stosb
-
- pop rcx
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
-
-.expand_end:
- pop rcx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, r12d ; rowctr
- test rax,rax
- jle near .return
-
- mov rdx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
-.rowloop:
- push rcx
- push rdi
- push rsi
-
- mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
- mov rdi, JSAMPROW [rdi] ; outptr
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
-
-.columnloop:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcsample-sse2.asm b/media/libjpeg/simd/jcsample-sse2.asm
deleted file mode 100644
index 83c9d152a7..0000000000
--- a/media/libjpeg/simd/jcsample-sse2.asm
+++ /dev/null
@@ -1,350 +0,0 @@
-;
-; jcsample.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v1_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test ecx,ecx
- jnz short .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
- mov edi, JSAMPROW [edi] ; outptr
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add edx, byte 2*SIZEOF_XMMWORD ; inptr0
- add esi, byte 2*SIZEOF_XMMWORD ; inptr1
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte 2*SIZEOF_JSAMPROW ; input_data
- add edi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdcolext-mmx.asm b/media/libjpeg/simd/jdcolext-mmx.asm
deleted file mode 100644
index 21e34f6786..0000000000
--- a/media/libjpeg/simd/jdcolext-mmx.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_mmx)
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
- movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
-
- pcmpeqw mm4,mm4
- pcmpeqw mm7,mm7
- psrlw mm4,BYTE_BIT
- psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
- movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand mm4,mm5 ; mm4=Cb(0246)=CbE
- psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
- pand mm0,mm1 ; mm0=Cr(0246)=CrE
- psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
-
- paddw mm4,mm7
- paddw mm5,mm7
- paddw mm0,mm7
- paddw mm1,mm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movq mm2,mm4 ; mm2=CbE
- movq mm3,mm5 ; mm3=CbO
- paddw mm4,mm4 ; mm4=2*CbE
- paddw mm5,mm5 ; mm5=2*CbO
- movq mm6,mm0 ; mm6=CrE
- movq mm7,mm1 ; mm7=CrO
- paddw mm0,mm0 ; mm0=2*CrE
- paddw mm1,mm1 ; mm1=2*CrO
-
- pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
- pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
- pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
- pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
-
- paddw mm4,[GOTOFF(eax,PW_ONE)]
- paddw mm5,[GOTOFF(eax,PW_ONE)]
- psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
- psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
- paddw mm0,[GOTOFF(eax,PW_ONE)]
- paddw mm1,[GOTOFF(eax,PW_ONE)]
- psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
- psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
-
- paddw mm4,mm2
- paddw mm5,mm3
- paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
- movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
-
- movq mm4,mm2
- movq mm5,mm3
- punpcklwd mm2,mm6
- punpckhwd mm4,mm6
- pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm3,mm7
- punpckhwd mm5,mm7
- pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm2,SCALEBITS
- psrad mm4,SCALEBITS
- paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm3,SCALEBITS
- psrad mm5,SCALEBITS
-
- packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movq mm5, MMWORD [esi] ; mm5=Y(01234567)
-
- pcmpeqw mm4,mm4
- psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
- pand mm4,mm5 ; mm4=Y(0246)=YE
- psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
-
- paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
- paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
- packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
- packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
- paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
- paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
- packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
- packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
- paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
- paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
- packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
- punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
-
- movq mmG,mmA
- movq mmH,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
- punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
-
- psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
- psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
-
- movq mmC,mmD
- movq mmB,mmD
- punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
- punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
-
- psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
-
- movq mmF,mmE
- punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
- punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
-
- punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
- punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
- punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA,mmC
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmE
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
-.column_st4:
- movd eax,mmA
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
- psrlq mmA,DWORD_BIT
- movd eax,mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
-.column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
- shr eax,WORD_BIT
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
-.column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .nextrow
- mov BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
- pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
- punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
- punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
-
- movq mmC,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
- punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
- movq mmG,mmB
- punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
- punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
-
- movq mmD,mmA
- punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
- punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
- movq mmH,mmC
- punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
- punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA,mmC
- movq mmD,mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
-.column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .nextrow
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
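The (Original) / (This implementation) comment block in the converter above is the heart of the trick: pmulhw keeps only the high 16 bits of a signed 16x16 multiply, i.e. (a*b) >> 16, so every constant must have magnitude below 1.0. Coefficients of 1.0 or more (1.402, 1.772) are therefore split into whole-number adds plus a fractional pmulhw, and the operand is doubled before the multiply and halved with rounding afterwards (the paddw PW_ONE / psraw 1 pair) to recover the lost low bit. A scalar C model of the red channel, a sketch under that reading rather than library code:

    #include <stdint.h>

    #define FIX(x)  ((int32_t)((x) * 65536.0 + 0.5))  /* 16.16 fixed point */

    /* Model of pmulhw: high 16 bits of a signed 16x16-bit multiply. */
    static int16_t pmulhw(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * b) >> 16);
    }

    /* R - Y = 1.40200*Cr, computed as Cr + 0.40200*Cr.  'cr' is the
       centered chroma sample in [-128,127] (the 0xFF80 add above). */
    static int16_t r_minus_y(int16_t cr)
    {
      int16_t f = pmulhw((int16_t)(2 * cr), (int16_t)FIX(0.40200));
      f = (int16_t)((f + 1) >> 1);      /* paddw PW_ONE; psraw 1 */
      return (int16_t)(f + cr);         /* + Cr: the integer part */
    }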
diff --git a/media/libjpeg/simd/jdcolext-sse2-64.asm b/media/libjpeg/simd/jdcolext-sse2-64.asm
deleted file mode 100644
index 4634066c45..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2-64.asm
+++ /dev/null
@@ -1,440 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; num_cols
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rdi, r13
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rax
- push rdi
- push rdx
- push rbx
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr0
- mov rbx, JSAMPROW [rbx] ; inptr1
- mov rdx, JSAMPROW [rdx] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
-.columnloop:
-
- movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[rel PW_ONE]
- paddw xmm5,[rel PW_ONE]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[rel PW_ONE]
- paddw xmm1,[rel PW_ONE]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm4,[rel PW_MF0344_F0285]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[rel PW_MF0344_F0285]
- pmaddwd xmm5,[rel PW_MF0344_F0285]
-
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm4,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[rel PD_ONEHALF]
- paddd xmm5,[rel PD_ONEHALF]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- lea rcx, [rcx+rcx*2] ; imul rcx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
-.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- mov BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- movd XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
- pop rcx
- pop rsi
- pop rbx
- pop rdx
- pop rdi
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- add rdi, byte SIZEOF_JSAMPROW ; output_buf
- dec rax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
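The .column_st15 / .column_st7 / .column_st3 / .column_st1 chain above handles the final partial vector of a row: it flushes whatever is left with progressively narrower stores (8, 4, 2, then 1 bytes), shifting the packed register right after each one, so the loop never writes past the end of the output row. The same idea in scalar C, as an illustrative sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Write the first 'nbytes' (0..15) bytes of a packed 16-byte vector
       without touching anything beyond the end of the destination row. */
    static void store_tail(uint8_t *dst, const uint8_t vec[16], size_t nbytes)
    {
      size_t off = 0;
      if (nbytes - off >= 8) { memcpy(dst + off, vec + off, 8); off += 8; }
      if (nbytes - off >= 4) { memcpy(dst + off, vec + off, 4); off += 4; }
      if (nbytes - off >= 2) { memcpy(dst + off, vec + off, 2); off += 2; }
      if (nbytes - off >= 1) { dst[off] = vec[off]; }
    }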
diff --git a/media/libjpeg/simd/jdcolext-sse2.asm b/media/libjpeg/simd/jdcolext-sse2.asm
deleted file mode 100644
index 682aef35fc..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2.asm
+++ /dev/null
@@ -1,459 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- paddw xmm5,[GOTOFF(eax,PW_ONE)]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- paddw xmm1,[GOTOFF(eax,PW_ONE)]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of eax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
-.column_st1:
- ; Store the lower 1 byte of eax to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- mov BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- movd XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
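For full vectors, both SSE2 converters above choose between two store paths: non-temporal movntdq when outptr is 16-byte aligned, so the pixel data bypasses the cache, and plain unaligned movdqu otherwise; a single sfence after the row loop then orders the streaming stores before the function returns. Expressed with SSE2 intrinsics, the split looks roughly like this (a sketch; the deleted files are pure NASM, not intrinsics):

    #include <emmintrin.h>  /* SSE2 intrinsics */
    #include <stdint.h>

    static void store_vectors(uint8_t *dst, const __m128i *src, int nvec)
    {
      if (((uintptr_t)dst & 15) == 0) {
        for (int i = 0; i < nvec; i++)       /* movntdq path */
          _mm_stream_si128((__m128i *)dst + i, src[i]);
      } else {
        for (int i = 0; i < nvec; i++)       /* movdqu path */
          _mm_storeu_si128((__m128i *)dst + i, src[i]);
      }
    }

    /* After the last row, the caller issues _mm_sfence() -- the
       equivalent of the sfence instruction above. */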
diff --git a/media/libjpeg/simd/jdcolor-altivec.c b/media/libjpeg/simd/jdcolor-altivec.c
deleted file mode 100644
index 0dc4c427c2..0000000000
--- a/media/libjpeg/simd/jdcolor-altivec.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* YCC --> RGB CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
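jdcolor-altivec.c above defines no function body of its own: it is a dispatcher that re-includes jdcolext-altivec.c once per output pixel format, each pass redefining RGB_PIXELSIZE, the RGB_INDEX* permutation constants, and the function name, so the compiler stamps out one specialized converter per format (the NASM files below do the same with %define plus repeated %include). A self-contained C sketch of that specialization idea, using a macro instead of a separate include file so it compiles on its own; fill_rgb and fill_rgbx are hypothetical names, not libjpeg API:

    #include <stdint.h>

    /* One generic body, instantiated per pixel format.  PIXELSIZE is a
       compile-time constant, so the filler branch folds away. */
    #define DEFINE_FILL(NAME, PIXELSIZE)                                \
      static void NAME(uint8_t *out, int npix,                          \
                       uint8_t r, uint8_t g, uint8_t b)                 \
      {                                                                 \
        for (int i = 0; i < npix; i++) {                                \
          out[0] = r; out[1] = g; out[2] = b;                           \
          if ((PIXELSIZE) == 4) out[3] = 0xFF; /* X filler byte */      \
          out += (PIXELSIZE);                                           \
        }                                                               \
      }

    DEFINE_FILL(fill_rgb, 3)   /* RGB:  3 bytes per pixel */
    DEFINE_FILL(fill_rgbx, 4)  /* RGBX: 4 bytes per pixel */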
diff --git a/media/libjpeg/simd/jdcolor-mmx.asm b/media/libjpeg/simd/jdcolor-mmx.asm
deleted file mode 100644
index 4e58031dd0..0000000000
--- a/media/libjpeg/simd/jdcolor-mmx.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_mmx)
-
-EXTN(jconst_ycc_rgb_convert_mmx):
-
-PW_F0402 times 4 dw F_0_402
-PW_MF0228 times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE times 4 dw 1
-PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
-%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2-64.asm b/media/libjpeg/simd/jdcolor-sse2-64.asm
deleted file mode 100644
index d2bf210007..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2-64.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2.asm b/media/libjpeg/simd/jdcolor-sse2.asm
deleted file mode 100644
index 7ff5d05d0c..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmerge-altivec.c b/media/libjpeg/simd/jdmerge-altivec.c
deleted file mode 100644
index 6a35f2019c..0000000000
--- a/media/libjpeg/simd/jdmerge-altivec.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
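
The deleted file expresses the conversion coefficients as 16-bit binary fractions: F_0_344 = 22554 is FIX(0.34414) with SCALEBITS = 16, and the derived constants F_0_402/F_0_285/F_0_228 fold the integer part out of the multipliers. A small self-check, assuming FIX(x) rounds x * 2^16 as the comments indicate:

/* Verify that the constants above are FIX(x) = round(x * 2^16), and that
 * the derived values match the definitions in the deleted file (a sketch). */
#include <assert.h>
#include <stdio.h>

#define SCALEBITS 16
#define FIX(x)  ((long)((x) * (1L << SCALEBITS) + 0.5))

int main(void) {
  assert(FIX(0.34414) == 22554);    /* F_0_344 */
  assert(FIX(0.71414) == 46802);    /* F_0_714 */
  assert(FIX(1.40200) == 91881);    /* F_1_402 */
  assert(FIX(1.77200) == 116130);   /* F_1_772 */
  /* Derived constants drop the integer part so each multiplier fits a
   * signed 16-bit word: */
  assert(FIX(1.40200) - 65536 == 26345);    /* F_0_402 = FIX(1.402) - FIX(1) */
  assert(65536 - FIX(0.71414) == 18734);    /* F_0_285 = FIX(1) - FIX(0.71414) */
  assert(131072 - FIX(1.77200) == 14942);   /* F_0_228 = FIX(2) - FIX(1.772) */
  puts("all fixed-point constants check out");
  return 0;
}
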
diff --git a/media/libjpeg/simd/jdmerge-mmx.asm b/media/libjpeg/simd/jdmerge-mmx.asm
deleted file mode 100644
index ee58bff1c6..0000000000
--- a/media/libjpeg/simd/jdmerge-mmx.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_mmx)
-
-EXTN(jconst_merged_upsample_mmx):
-
-PW_F0402 times 4 dw F_0_402
-PW_MF0228 times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE times 4 dw 1
-PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2-64.asm b/media/libjpeg/simd/jdmerge-sse2-64.asm
deleted file mode 100644
index 244bd40234..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2-64.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2.asm b/media/libjpeg/simd/jdmerge-sse2.asm
deleted file mode 100644
index 236de5a385..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmrgext-mmx.asm b/media/libjpeg/simd/jdmrgext-mmx.asm
deleted file mode 100644
index 63f45cf373..0000000000
--- a/media/libjpeg/simd/jdmrgext-mmx.asm
+++ /dev/null
@@ -1,463 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 3
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [output_width(eax)] ; col
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
-
- pop ecx ; col
-
- alignx 16,7
-.columnloop:
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
- movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
-
- pxor mm1,mm1 ; mm1=(all 0's)
- pcmpeqw mm3,mm3
- psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
- movq mm4,mm6
- punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH
- punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL
- movq mm0,mm7
- punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH
- punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL
-
- paddw mm6,mm3
- paddw mm4,mm3
- paddw mm7,mm3
- paddw mm0,mm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movq mm5,mm6 ; mm5=CbH
- movq mm2,mm4 ; mm2=CbL
- paddw mm6,mm6 ; mm6=2*CbH
- paddw mm4,mm4 ; mm4=2*CbL
- movq mm1,mm7 ; mm1=CrH
- movq mm3,mm0 ; mm3=CrL
- paddw mm7,mm7 ; mm7=2*CrH
- paddw mm0,mm0 ; mm0=2*CrL
-
- pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
- pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
- pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
- pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
-
- paddw mm6,[GOTOFF(eax,PW_ONE)]
- paddw mm4,[GOTOFF(eax,PW_ONE)]
- psraw mm6,1 ; mm6=(CbH * -FIX(0.22800))
- psraw mm4,1 ; mm4=(CbL * -FIX(0.22800))
- paddw mm7,[GOTOFF(eax,PW_ONE)]
- paddw mm0,[GOTOFF(eax,PW_ONE)]
- psraw mm7,1 ; mm7=(CrH * FIX(0.40200))
- psraw mm0,1 ; mm0=(CrL * FIX(0.40200))
-
- paddw mm6,mm5
- paddw mm4,mm2
- paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
- movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
-
- movq mm6,mm5
- movq mm7,mm2
- punpcklwd mm5,mm1
- punpckhwd mm6,mm1
- pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm2,mm3
- punpckhwd mm7,mm3
- pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm6,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm5,SCALEBITS
- psrad mm6,SCALEBITS
- paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm7,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm2,SCALEBITS
- psrad mm7,SCALEBITS
-
- packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
- alignx 16,7
-
-.Yloop_2nd:
- movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
- movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
- movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
- alignx 16,7
-
-.Yloop_1st:
- movq mm7, MMWORD [esi] ; mm7=Y(01234567)
-
- pcmpeqw mm6,mm6
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
- pand mm6,mm7 ; mm6=Y(0246)=YE
- psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO
-
- movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H)
- movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H)
- movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H)
-
- paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
- paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
- packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
- packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
- paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
- paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
- packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
- packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
- paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
- paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
- packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
- punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
-
- movq mmG,mmA
- movq mmH,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
- punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
-
- psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
- psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
-
- movq mmC,mmD
- movq mmB,mmD
- punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
- punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
-
- psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
-
- movq mmF,mmE
- punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
- punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
-
- punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
- punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
- punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
- sub ecx, byte SIZEOF_MMWORD
- jz near .endcolumn
-
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- add esi, byte SIZEOF_MMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA,mmC
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmE
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
-.column_st4:
- movd eax,mmA
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
- psrlq mmA,DWORD_BIT
- movd eax,mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
-.column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
- shr eax,WORD_BIT
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
-.column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .endcolumn
- mov BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
- pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
- punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
- punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
-
- movq mmC,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
- punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
- movq mmG,mmB
- punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
- punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
-
- movq mmD,mmA
- punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
- punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
- movq mmH,mmC
- punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
- punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .endcolumn
-
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- add esi, byte SIZEOF_MMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA,mmC
- movq mmD,mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
-.column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .endcolumn
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_mmx)
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, JDIMENSION [output_width(ebp)]
-
- mov edi, JSAMPIMAGE [input_buf(ebp)]
- mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(ebp)]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
-
- push edx ; inptr2
- push ebx ; inptr1
- push esi ; inptr00
- mov ebx,esp
-
- push edi ; output_buf (outptr0)
- push ecx ; in_row_group_ctr
- push ebx ; input_buf
- push eax ; output_width
-
- call near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
- add esi, byte SIZEOF_JSAMPROW ; inptr01
- add edi, byte SIZEOF_JSAMPROW ; outptr1
- mov POINTER [ebx+0*SIZEOF_POINTER], esi
- mov POINTER [ebx-1*SIZEOF_POINTER], edi
-
- call near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
- add esp, byte 7*SIZEOF_DWORD
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
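
Note that jsimd_h2v2_merged_upsample_mmx above contains no conversion code of its own: one chroma row drives two output rows, so it builds an argument block on the stack and calls the h2v1 kernel twice, advancing only the Y-row and output-row pointers in between. The same shape in C (h2v1_merged_upsample() is assumed to be a scalar kernel like the one sketched earlier):

/* h2v2 as two h2v1 passes: same Cb/Cr row, successive Y and output rows. */
void h2v1_merged_upsample(int width, const unsigned char *y,
                          const unsigned char *cb, const unsigned char *cr,
                          unsigned char *rgb);

void h2v2_merged_upsample(int width, const unsigned char *y_rows[2],
                          const unsigned char *cb, const unsigned char *cr,
                          unsigned char *rgb_rows[2]) {
  /* The chroma row is reused; only the Y and output rows advance. */
  h2v1_merged_upsample(width, y_rows[0], cb, cr, rgb_rows[0]);
  h2v1_merged_upsample(width, y_rows[1], cb, cr, rgb_rows[1]);
}
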
diff --git a/media/libjpeg/simd/jdmrgext-sse2-64.asm b/media/libjpeg/simd/jdmrgext-sse2-64.asm
deleted file mode 100644
index ad74c5ff4d..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2-64.asm
+++ /dev/null
@@ -1,537 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; col
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
- mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
- mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
-
- pop rcx ; col
-
-.columnloop:
-
- movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[rel PW_ONE]
- paddw xmm4,[rel PW_ONE]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[rel PW_ONE]
- paddw xmm0,[rel PW_ONE]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[rel PW_MF0344_F0285]
- pmaddwd xmm6,[rel PW_MF0344_F0285]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm7,[rel PW_MF0344_F0285]
-
- paddd xmm5,[rel PD_ONEHALF]
- paddd xmm6,[rel PD_ONEHALF]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm7,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
-
-.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
-
-.Yloop_1st:
- movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
-.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- mov BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- movd XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- sfence ; flush the write buffer
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov eax, r10d
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- add rdi, byte SIZEOF_JSAMPROW ; outptr1
- add rsi, byte SIZEOF_JSAMPROW ; inptr01
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
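
The store paths above pick movntdq (non-temporal) when the destination is 16-byte aligned and movdqu otherwise, then drain any partial vector through the .column_st* cascade in 8/4/2/1-byte steps and flush the write-combining buffer with sfence. A C sketch of that policy using SSE2 intrinsics (memcpy stands in for the movq/movd/mov register shuffles of the tail cascade):

/* Full vectors: streaming store if aligned, unaligned store otherwise.
 * Sub-vector tail: progressively smaller copies, as in .column_st15 etc. */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

void store_bytes(unsigned char *dst, const unsigned char *src, size_t n) {
  while (n >= 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)src);
    if (((uintptr_t)dst & 15) == 0)
      _mm_stream_si128((__m128i *)dst, v);   /* movntdq: bypass the cache */
    else
      _mm_storeu_si128((__m128i *)dst, v);   /* movdqu: unaligned store */
    dst += 16; src += 16; n -= 16;
  }
  /* Tail cascade: 8, 4, 2, then 1 byte, mirroring .column_st7/st3/st1. */
  for (size_t step = 8; step; step >>= 1)
    if (n >= step) {
      memcpy(dst, src, step);
      dst += step; src += step; n -= step;
    }
  _mm_sfence();   /* order the non-temporal stores, as the asm does */
}
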
diff --git a/media/libjpeg/simd/jdmrgext-sse2.asm b/media/libjpeg/simd/jdmrgext-sse2.asm
deleted file mode 100644
index b50f698b49..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2.asm
+++ /dev/null
@@ -1,518 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [output_width(eax)] ; col
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
-
- pop ecx ; col
-
- alignx 16,7
-.columnloop:
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[GOTOFF(eax,PW_ONE)]
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[GOTOFF(eax,PW_ONE)]
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
- alignx 16,7
-
-.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
- alignx 16,7
-
-.Yloop_1st:
- movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output if there is
-        ; enough space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
-.column_st1:
-        ; Store the low byte of eax to the output if there is enough
-        ; space.
- test ecx, ecx
- jz short .endcolumn
- mov BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output if there is
-        ; enough space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output if there is
-        ; enough space.
- test ecx, ecx
- jz short .endcolumn
- movd XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- sfence ; flush the write buffer
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
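
A note on the store path above: on the aligned branch the rows are written with
movntdq, a weakly ordered non-temporal store that bypasses the cache, so the
sfence at .endcolumn is needed to make those stores globally visible before the
routine returns. The same pairing expressed with C intrinsics (an illustration
of the instruction pair, not code from this file):

    #include <emmintrin.h>

    /* Illustration of the movntdq + sfence pairing used above. */
    static void stream_row(__m128i *dst, __m128i pixels)
    {
      _mm_stream_si128(dst, pixels);  /* non-temporal store (movntdq) */
      _mm_sfence();                   /* make the NT store globally visible */
    }
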
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
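
The wrapper below handles the 2:1-vertical case by calling the h2v1 routine
above twice, once per output row. It builds a three-entry input_buf on the
stack whose luma entry is pre-offset by the row-group counter (so the kernel's
index lands on luma row 2*ctr), then patches the luma entry and the output_buf
argument in place between the two calls. A rough C equivalent, given as a
sketch that assumes the h2v1 kernel indexes each plane's row array by
in_row_group_ctr; this is not the library's actual C code:

    #include "jpeglib.h"  /* JDIMENSION, JSAMPIMAGE, JSAMPARRAY */

    extern void jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
                                                JSAMPIMAGE input_buf,
                                                JDIMENSION in_row_group_ctr,
                                                JSAMPARRAY output_buf);

    static void h2v2_merged_upsample_sketch(JDIMENSION output_width,
                                            JSAMPIMAGE input_buf,
                                            JDIMENSION in_row_group_ctr,
                                            JSAMPARRAY output_buf)
    {
      /* Offsetting the luma row array by the group counter makes the
         kernel's index in_row_group_ctr land on luma row 2*ctr. */
      JSAMPARRAY fake_input_buf[3];
      fake_input_buf[0] = input_buf[0] + in_row_group_ctr;
      fake_input_buf[1] = input_buf[1];  /* one Cb row per row group */
      fake_input_buf[2] = input_buf[2];  /* one Cr row per row group */

      jsimd_h2v1_merged_upsample_sse2(output_width, fake_input_buf,
                                      in_row_group_ctr, output_buf);

      fake_input_buf[0]++;  /* advance to luma row 2*ctr+1 */
      output_buf++;         /* advance to the second output row */
      jsimd_h2v1_merged_upsample_sse2(output_width, fake_input_buf,
                                      in_row_group_ctr, output_buf);
    }
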
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, POINTER [output_width(ebp)]
-
- mov edi, JSAMPIMAGE [input_buf(ebp)]
- mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(ebp)]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
-
- push edx ; inptr2
- push ebx ; inptr1
- push esi ; inptr00
- mov ebx,esp
-
- push edi ; output_buf (outptr0)
- push ecx ; in_row_group_ctr
- push ebx ; input_buf
- push eax ; output_width
-
- call near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- add esi, byte SIZEOF_JSAMPROW ; inptr01
- add edi, byte SIZEOF_JSAMPROW ; outptr1
- mov POINTER [ebx+0*SIZEOF_POINTER], esi
- mov POINTER [ebx-1*SIZEOF_POINTER], edi
-
- call near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- add esp, byte 7*SIZEOF_DWORD
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-mmx.asm b/media/libjpeg/simd/jdsample-mmx.asm
deleted file mode 100644
index 5e4fa7ae22..0000000000
--- a/media/libjpeg/simd/jdsample-mmx.asm
+++ /dev/null
@@ -1,736 +0,0 @@
-;
-; jdsample.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_mmx)
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE times 4 dw 1
-PW_TWO times 4 dw 2
-PW_THREE times 4 dw 3
-PW_SEVEN times 4 dw 7
-PW_EIGHT times 4 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
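
In scalar terms, the arithmetic below (the PW_THREE multiply plus the
PW_ONE/PW_TWO rounding biases and a shift by 2) produces, for each input
sample, one output weighted 3/4 toward its left neighbor pair and one weighted
3/4 toward its right. A minimal C model, assuming 8-bit samples and replicated
edge samples, which is exactly what the mm6/mm7 edge masking below arranges:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one row of h2v1 fancy (triangle-filter) upsampling. */
    static void h2v1_fancy_upsample_row(const uint8_t *s, uint8_t *out,
                                        size_t width)
    {
      for (size_t i = 0; i < width; i++) {
        uint8_t left  = (i == 0)         ? s[0]         : s[i - 1];
        uint8_t right = (i == width - 1) ? s[width - 1] : s[i + 1];
        out[2 * i]     = (uint8_t)((3 * s[i] + left  + 1) >> 2);
        out[2 * i + 1] = (uint8_t)((3 * s[i] + right + 2) >> 2);
      }
    }

With the edges replicated this way, the first and last outputs reduce to exact
copies of the edge sample ((4*s + 1) >> 2 and (4*s + 2) >> 2 both equal s),
matching the scalar reference behavior.
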
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
- push ebp
- mov ebp,esp
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- test eax, SIZEOF_MMWORD-1
- jz short .skip
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor mm0,mm0 ; mm0=(all 0's)
- pcmpeqb mm7,mm7
- psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
- pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
- add eax, byte SIZEOF_MMWORD-1
- and eax, byte -SIZEOF_MMWORD
- cmp eax, byte SIZEOF_MMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- pcmpeqb mm6,mm6
- psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
- pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
- psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
- movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm2,mm1
- movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)
- psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
- psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
-
- por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)
- por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)
-
- movq mm7,mm1
- psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
-
- movq mm4,mm1
- punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)
- punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)
- movq mm5,mm2
- punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)
- punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)
- movq mm6,mm3
- punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)
- punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)
-
- pmullw mm1,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
- paddw mm2,[GOTOFF(ebx,PW_ONE)]
- paddw mm5,[GOTOFF(ebx,PW_ONE)]
- paddw mm3,[GOTOFF(ebx,PW_TWO)]
- paddw mm6,[GOTOFF(ebx,PW_TWO)]
-
- paddw mm2,mm1
- paddw mm5,mm4
- psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)
- psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)
- paddw mm3,mm1
- paddw mm6,mm4
- psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)
- psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)
-
- psllw mm3,BYTE_BIT
- psllw mm6,BYTE_BIT
- por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
- por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
- sub eax, byte SIZEOF_MMWORD
- add esi, byte 1*SIZEOF_MMWORD ; inptr
- add edi, byte 2*SIZEOF_MMWORD ; outptr
- cmp eax, byte SIZEOF_MMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
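
The 2:2 case applies the same triangle filter in two passes: a vertical pass
that mixes the nearer and farther input rows 3:1, then the horizontal pass
applied to the 10-bit intermediate sums, for a total weight of 16 (hence the
PW_SEVEN/PW_EIGHT biases and the shift by 4). A scalar sketch for one output
row; rowadj is the row above when producing the upper output row and the row
below for the lower one, with edge columns replicated as in the code below:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one output row of h2v2 fancy upsampling. */
    static void h2v2_fancy_upsample_row(const uint8_t *row0,
                                        const uint8_t *rowadj,
                                        uint8_t *out, size_t width)
    {
      for (size_t i = 0; i < width; i++) {
        int cur   = 3 * row0[i] + rowadj[i];   /* vertical pass, 0..1020 */
        int left  = (i == 0) ? cur
                             : 3 * row0[i - 1] + rowadj[i - 1];
        int right = (i == width - 1) ? cur
                                     : 3 * row0[i + 1] + rowadj[i + 1];
        out[2 * i]     = (uint8_t)((3 * cur + left  + 8) >> 4);
        out[2 * i + 1] = (uint8_t)((3 * cur + right + 7) >> 4);
      }
    }
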
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 4
-%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
-        pushpic eax             ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov edx,eax ; edx = original ebp
- mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(edx)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(edx)] ; input_data
- mov edi, POINTER [output_data_ptr(edx)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push ecx
- push edi
- push esi
-
- mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test eax, SIZEOF_MMWORD-1
- jz short .skip
- push edx
- mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop edx
-.skip:
- ; -- process the first column block
-
- movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
- movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
- movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor mm3,mm3 ; mm3=(all 0's)
- movq mm4,mm0
- punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)
- punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)
- movq mm5,mm1
- punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)
- punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)
- movq mm6,mm2
- punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)
- punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)
-
- pmullw mm0,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
-
- pcmpeqb mm7,mm7
- psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
- paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
- paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
- paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
- paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
-
- movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
- movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
- pand mm1,mm7 ; mm1=( 0 - - -)
- pand mm2,mm7 ; mm2=( 0 - - -)
-
- movq MMWORD [wk(0)], mm1
- movq MMWORD [wk(1)], mm2
-
- poppic ebx
-
- add eax, byte SIZEOF_MMWORD-1
- and eax, byte -SIZEOF_MMWORD
- cmp eax, byte SIZEOF_MMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- ; -- process the last column block
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pcmpeqb mm1,mm1
- psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
- movq mm2,mm1
-
- pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
- pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
-
- movq MMWORD [wk(2)], mm1
- movq MMWORD [wk(3)], mm2
-
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- ; -- process the next column block
-
- movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
- movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor mm3,mm3 ; mm3=(all 0's)
- movq mm4,mm0
- punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)
- punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)
- movq mm5,mm1
- punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)
- punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)
- movq mm6,mm2
- punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)
- punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)
-
- pmullw mm0,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
-
- paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
- paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
- paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
- paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
-
- movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
- movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
- psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
- psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
-
- movq MMWORD [wk(2)], mm1
- movq MMWORD [wk(3)], mm2
-
-.upsample:
- ; -- process the upper row
-
- movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
- movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
-
- movq mm0,mm7
- movq mm4,mm3
- psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)
- psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
- movq mm5,mm7
- movq mm6,mm3
- psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
- psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)
-
- por mm0,mm4 ; mm0=( 1 2 3 4)
- por mm5,mm6 ; mm5=( 3 4 5 6)
-
- movq mm1,mm7
- movq mm2,mm3
- psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
- psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)
- movq mm4,mm3
- psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
-
- por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
- por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
-
- movq MMWORD [wk(0)], mm4
-
- pmullw mm7,[GOTOFF(ebx,PW_THREE)]
- pmullw mm3,[GOTOFF(ebx,PW_THREE)]
- paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm5,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm0,[GOTOFF(ebx,PW_SEVEN)]
- paddw mm2,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw mm1,mm7
- paddw mm5,mm3
- psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)
- psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)
- paddw mm0,mm7
- paddw mm2,mm3
- psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)
- psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)
-
- psllw mm0,BYTE_BIT
- psllw mm2,BYTE_BIT
- por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
- por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
- movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
- ; -- process the lower row
-
- movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
- movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
-
- movq mm7,mm6
- movq mm3,mm4
- psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)
- psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
- movq mm0,mm6
- movq mm2,mm4
- psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
- psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)
-
- por mm7,mm3 ; mm7=( 1 2 3 4)
- por mm0,mm2 ; mm0=( 3 4 5 6)
-
- movq mm1,mm6
- movq mm5,mm4
- psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
- psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)
- movq mm3,mm4
- psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
-
- por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
- por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
-
- movq MMWORD [wk(1)], mm3
-
- pmullw mm6,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
- paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm0,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm7,[GOTOFF(ebx,PW_SEVEN)]
- paddw mm5,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw mm1,mm6
- paddw mm0,mm4
- psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)
- psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)
- paddw mm7,mm6
- paddw mm5,mm4
- psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)
- psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)
-
- psllw mm7,BYTE_BIT
- psllw mm5,BYTE_BIT
- por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
- por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
- poppic ebx
-
- sub eax, byte SIZEOF_MMWORD
- add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
- add ebx, byte 1*SIZEOF_MMWORD ; inptr0
- add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
- add edx, byte 2*SIZEOF_MMWORD ; outptr0
- add edi, byte 2*SIZEOF_MMWORD ; outptr1
- cmp eax, byte SIZEOF_MMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop ecx
- pop eax
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
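
For the plain box filter there is no arithmetic at all: each input byte is
written twice, which is what the punpcklbw/punpckhbw self-unpacks below achieve
eight bytes at a time. A scalar sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one row of h2v1 box upsampling: pixel doubling. */
    static void h2v1_upsample_row(const uint8_t *s, uint8_t *out, size_t width)
    {
      for (size_t i = 0; i < width; i++)
        out[2 * i] = out[2 * i + 1] = s[i];
    }
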
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_mmx)
-
-EXTN(jsimd_h2v1_upsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_MMWORD)-1
- and edx, byte -(2*SIZEOF_MMWORD)
- jz short .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
- movq mm1,mm0
- punpcklbw mm0,mm0
- punpckhbw mm1,mm1
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm3,mm2
- punpcklbw mm2,mm2
- punpckhbw mm3,mm3
-
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add edi, byte 4*SIZEOF_MMWORD ; outptr
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
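
The 2:2 box filter is the same horizontal doubling with the result written to
two output rows, exactly as the code below stores each mm register to both
outptr0 and outptr1. In terms of the previous sketch (h2v1_upsample_row is the
hypothetical helper defined there):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: double horizontally once, then duplicate the row vertically. */
    static void h2v2_upsample_rows(const uint8_t *in, uint8_t *out_row0,
                                   uint8_t *out_row1, size_t width)
    {
      h2v1_upsample_row(in, out_row0, width);
      memcpy(out_row1, out_row0, 2 * width);
    }
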
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_mmx)
-
-EXTN(jsimd_h2v2_upsample_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_MMWORD)-1
- and edx, byte -(2*SIZEOF_MMWORD)
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
- movq mm1,mm0
- punpcklbw mm0,mm0
- punpckhbw mm1,mm1
-
- movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
- movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm3,mm2
- punpcklbw mm2,mm2
- punpckhbw mm3,mm3
-
- movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
- movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add ebx, byte 4*SIZEOF_MMWORD ; outptr0
- add edi, byte 4*SIZEOF_MMWORD ; outptr1
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-sse2-64.asm b/media/libjpeg/simd/jdsample-sse2-64.asm
deleted file mode 100644
index 1faaed648a..0000000000
--- a/media/libjpeg/simd/jdsample-sse2-64.asm
+++ /dev/null
@@ -1,670 +0,0 @@
-;
-; jdsample.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE times 8 dw 1
-PW_TWO times 8 dw 2
-PW_THREE times 8 dw 3
-PW_SEVEN times 8 dw 7
-PW_EIGHT times 8 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
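
(The register comments above reflect the convention established by the
collect_args macro from jsimdext.inc: it copies the incoming argument registers
of the host ABI into r10-r13 (rdi/rsi/rdx/rcx on SysV/AMD64; rcx/rdx/r8/r9 on
Win64), so a single function body serves both calling conventions. This is an
editorial gloss of the macro's behavior, not text quoted from it.)
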
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov eax, r11d ; colctr
- test rax,rax
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rax ; colctr
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
-
- test rax, SIZEOF_XMMWORD-1
- jz short .skip
- mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor xmm0,xmm0 ; xmm0=(all 0's)
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-1)
- pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- add rax, byte SIZEOF_XMMWORD-1
- and rax, byte -SIZEOF_XMMWORD
- cmp rax, byte SIZEOF_XMMWORD
- ja short .columnloop
-
-.columnloop_last:
- pcmpeqb xmm6,xmm6
- pslldq xmm6,(SIZEOF_XMMWORD-1)
- pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- jmp short .upsample
-
-.columnloop:
- movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- pslldq xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
- pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
- psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
-
- por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
- por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
-
- movdqa xmm7,xmm1
- psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
- movdqa xmm4,xmm1
- punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm2
- punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
- punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
- movdqa xmm6,xmm3
- punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
- punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
-
- pmullw xmm1,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
- paddw xmm2,[rel PW_ONE]
- paddw xmm5,[rel PW_ONE]
- paddw xmm3,[rel PW_TWO]
- paddw xmm6,[rel PW_TWO]
-
- paddw xmm2,xmm1
- paddw xmm5,xmm4
- psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
- paddw xmm3,xmm1
- paddw xmm6,xmm4
- psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
- psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
- psllw xmm3,BYTE_BIT
- psllw xmm6,BYTE_BIT
- por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
- por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
- sub rax, byte SIZEOF_XMMWORD
- add rsi, byte 1*SIZEOF_XMMWORD ; inptr
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- cmp rax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop rsi
- pop rdi
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rcx ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 4
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov eax, r11d ; colctr
- test rax,rax
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rax ; colctr
- push rcx
- push rdi
- push rsi
-
- mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test rax, SIZEOF_XMMWORD-1
- jz short .skip
- push rdx
- mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop rdx
-.skip:
- ; -- process the first column block
-
- movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
- movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
- movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
-
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-2)
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
- pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
- pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
-
- movdqa XMMWORD [wk(0)], xmm1
- movdqa XMMWORD [wk(1)], xmm2
-
- add rax, byte SIZEOF_XMMWORD-1
- and rax, byte -SIZEOF_XMMWORD
- cmp rax, byte SIZEOF_XMMWORD
- ja short .columnloop
-
-.columnloop_last:
- ; -- process the last column block
-
- pcmpeqb xmm1,xmm1
- pslldq xmm1,(SIZEOF_XMMWORD-2)
- movdqa xmm2,xmm1
-
- pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
- pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
- movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
- movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
-
- jmp near .upsample
-
-.columnloop:
- ; -- process the next column block
-
- movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
- movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
- pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
- pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
-
- movdqa XMMWORD [wk(2)], xmm1
- movdqa XMMWORD [wk(3)], xmm2
-
-.upsample:
- ; -- process the upper row
-
- movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
- movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
- movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
- psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
- pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
- movdqa xmm5,xmm7
- movdqa xmm6,xmm3
- psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
- pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
-
- por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
- por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm7
- movdqa xmm2,xmm3
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
- movdqa xmm4,xmm3
- psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(0)], xmm4
-
- pmullw xmm7,[rel PW_THREE]
- pmullw xmm3,[rel PW_THREE]
- paddw xmm1,[rel PW_EIGHT]
- paddw xmm5,[rel PW_EIGHT]
- paddw xmm0,[rel PW_SEVEN]
- paddw xmm2,[rel PW_SEVEN]
-
- paddw xmm1,xmm7
- paddw xmm5,xmm3
- psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
- paddw xmm0,xmm7
- paddw xmm2,xmm3
- psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm0,BYTE_BIT
- psllw xmm2,BYTE_BIT
- por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
- por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
- ; -- process the lower row
-
- movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
- movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
- movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
- movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
- psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
- pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
- movdqa xmm0,xmm6
- movdqa xmm2,xmm4
- psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
- pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
-
- por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
- por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
- movdqa xmm3,xmm4
- psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(1)], xmm3
-
- pmullw xmm6,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
- paddw xmm1,[rel PW_EIGHT]
- paddw xmm0,[rel PW_EIGHT]
- paddw xmm7,[rel PW_SEVEN]
- paddw xmm5,[rel PW_SEVEN]
-
- paddw xmm1,xmm6
- paddw xmm0,xmm4
- psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
- paddw xmm7,xmm6
- paddw xmm5,xmm4
- psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm7,BYTE_BIT
- psllw xmm5,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
- por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
- sub rax, byte SIZEOF_XMMWORD
- add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
- add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
- add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
- add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
- cmp rax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test rax,rax
- jnz near .columnloop_last
-
- pop rsi
- pop rdi
- pop rcx
- pop rax
-
- add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub rcx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov edx, r11d
- add rdx, byte (2*SIZEOF_XMMWORD)-1
- and rdx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz short .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
- mov rax,rdx ; colctr
-.columnloop:
-
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rdi, byte 4*SIZEOF_XMMWORD ; outptr
- jmp short .columnloop
-
-.nextrow:
- pop rsi
- pop rdi
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rcx ; rowctr
- jg short .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov edx, r11d
- add rdx, byte (2*SIZEOF_XMMWORD)-1
- and rdx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
- mov rax,rdx ; colctr
-.columnloop:
-
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
- add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
-
-.nextrow:
- pop rsi
- pop rdi
-
- add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub rcx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-sse2.asm b/media/libjpeg/simd/jdsample-sse2.asm
deleted file mode 100644
index 1d0059e803..0000000000
--- a/media/libjpeg/simd/jdsample-sse2.asm
+++ /dev/null
@@ -1,728 +0,0 @@
-;
-; jdsample.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE times 8 dw 1
-PW_TWO times 8 dw 2
-PW_THREE times 8 dw 3
-PW_SEVEN times 8 dw 7
-PW_EIGHT times 8 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
- push ebp
- mov ebp,esp
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor xmm0,xmm0 ; xmm0=(all 0's)
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-1)
- pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- pcmpeqb xmm6,xmm6
- pslldq xmm6,(SIZEOF_XMMWORD-1)
- pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
- pslldq xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
- pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
- psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
-
- por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
- por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
-
- movdqa xmm7,xmm1
- psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
- movdqa xmm4,xmm1
- punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm2
- punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
- punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
- movdqa xmm6,xmm3
- punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
- punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
-
- pmullw xmm1,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm2,[GOTOFF(ebx,PW_ONE)]
- paddw xmm5,[GOTOFF(ebx,PW_ONE)]
- paddw xmm3,[GOTOFF(ebx,PW_TWO)]
- paddw xmm6,[GOTOFF(ebx,PW_TWO)]
-
- paddw xmm2,xmm1
- paddw xmm5,xmm4
- psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
- paddw xmm3,xmm1
- paddw xmm6,xmm4
- psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
- psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
- psllw xmm3,BYTE_BIT
- psllw xmm6,BYTE_BIT
- por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
- por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
- sub eax, byte SIZEOF_XMMWORD
- add esi, byte 1*SIZEOF_XMMWORD ; inptr
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 4
-%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
-        pushpic eax             ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov edx,eax ; edx = original ebp
- mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(edx)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(edx)] ; input_data
- mov edi, POINTER [output_data_ptr(edx)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push ecx
- push edi
- push esi
-
- mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- push edx
- mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop edx
-.skip:
- ; -- process the first column block
-
- movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
- movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
- movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
-
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-2)
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
- pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
- pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
-
- movdqa XMMWORD [wk(0)], xmm1
- movdqa XMMWORD [wk(1)], xmm2
-
- poppic ebx
-
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- ; -- process the last column block
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pcmpeqb xmm1,xmm1
- pslldq xmm1,(SIZEOF_XMMWORD-2)
- movdqa xmm2,xmm1
-
- pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
- pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
- movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
- movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
-
- jmp near .upsample
- alignx 16,7
-
-.columnloop:
- ; -- process the next column block
-
- movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
- movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
- pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
- pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
-
- movdqa XMMWORD [wk(2)], xmm1
- movdqa XMMWORD [wk(3)], xmm2
-
-.upsample:
- ; -- process the upper row
-
- movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
- movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
- movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
- psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
- pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
- movdqa xmm5,xmm7
- movdqa xmm6,xmm3
- psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
- pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
-
- por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
- por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm7
- movdqa xmm2,xmm3
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
- movdqa xmm4,xmm3
- psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(0)], xmm4
-
- pmullw xmm7,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm3,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm7
- paddw xmm5,xmm3
- psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
- paddw xmm0,xmm7
- paddw xmm2,xmm3
- psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm0,BYTE_BIT
- psllw xmm2,BYTE_BIT
- por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
- por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
- ; -- process the lower row
-
- movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
- movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
- movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
- movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
- psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
- pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
- movdqa xmm0,xmm6
- movdqa xmm2,xmm4
- psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
- pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
-
- por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
- por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
- movdqa xmm3,xmm4
- psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(1)], xmm3
-
- pmullw xmm6,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm7,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm6
- paddw xmm0,xmm4
- psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
- paddw xmm7,xmm6
- paddw xmm5,xmm4
- psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm7,BYTE_BIT
- psllw xmm5,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
- por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
- poppic ebx
-
- sub eax, byte SIZEOF_XMMWORD
- add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
- add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
- add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
- add edx, byte 2*SIZEOF_XMMWORD ; outptr0
- add edi, byte 2*SIZEOF_XMMWORD ; outptr1
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop ecx
- pop eax
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
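For reference: the routine ending above is the h2v2 "fancy" upsampler, judging by its three input rows and its PW_THREE/PW_SEVEN/PW_EIGHT triangle-filter weights. A scalar C sketch of the same arithmetic, assuming the behavior of h2v2_fancy_upsample() in jdsample.c that this SIMD code mirrors; the helper name and edge handling here are illustrative, not from the library:

    #include <stdint.h>

    /* One output row: "above" is the adjacent input row (row[-1] or
     * row[+1]), "center" is row[0].  The vertical blend (3*center +
     * above) is what the pmullw PW_THREE / paddw steps compute; the
     * horizontal blend adds the left neighbor with +8 rounding for even
     * outputs and the right neighbor with +7 rounding for odd outputs
     * (PW_EIGHT / PW_SEVEN), then shifts right by 4. */
    static void h2v2_fancy_row(const uint8_t *above, const uint8_t *center,
                               uint8_t *out, int in_width)
    {
      for (int x = 0; x < in_width; x++) {
        int cur   = 3 * center[x] + above[x];
        int left  = (x > 0)            ? 3 * center[x-1] + above[x-1] : cur;
        int right = (x < in_width - 1) ? 3 * center[x+1] + above[x+1] : cur;
        out[2*x]     = (uint8_t)((3 * cur + left  + 8) >> 4);
        out[2*x + 1] = (uint8_t)((3 * cur + right + 7) >> 4);
      }
    }

Each output pixel is thus a 9:3:3:1 blend of its four nearest input samples, with edge columns replicated; the wk(0..3) slots in the assembly carry exactly those edge words across column blocks.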
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
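A minimal scalar sketch of what this routine computes: straight pixel replication, which the punpcklbw/punpckhbw self-interleave below performs 16 bytes at a time. The helper name is illustrative, not from the library:

    #include <stdint.h>

    static void h2v1_upsample_row(const uint8_t *in, uint8_t *out,
                                  int out_width)
    {
      for (int x = 0; x < out_width; x++)
        out[x] = in[x / 2];   /* each input sample is emitted twice */
    }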
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz short .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 4*SIZEOF_XMMWORD ; outptr
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg short .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
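The h2v2 variant is the same box filter extended vertically: it widens one input row and stores it to two output rows (note the duplicated movdqa stores to ebx/outptr0 and edi/outptr1 below). A hedged scalar sketch with illustrative names:

    #include <stdint.h>
    #include <string.h>

    static void h2v2_upsample_rows(const uint8_t *in, uint8_t *out0,
                                   uint8_t *out1, int out_width)
    {
      for (int x = 0; x < out_width; x++)
        out0[x] = in[x / 2];                  /* 2:1 horizontal replication */
      memcpy(out1, out0, (size_t)out_width);  /* 2:1 vertical replication */
    }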
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
- add edi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg short .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-3dn.asm b/media/libjpeg/simd/jfdctflt-3dn.asm
deleted file mode 100644
index 219161819a..0000000000
--- a/media/libjpeg/simd/jfdctflt-3dn.asm
+++ /dev/null
@@ -1,319 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_3dnow)
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382 times 2 dd 0.382683432365089771728460
-PD_0_707 times 2 dd 0.707106781186547524400844
-PD_0_541 times 2 dd 0.541196100146196984399723
-PD_1_306 times 2 dd 1.306562964876376527856643
-
- alignz 16
-
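These four constants are the rotation factors of the AA&N (Arai, Agui, Nakajima) scaled DCT. For orientation, a scalar C version of one 1-D pass, following the flowgraph of IJG's jfdctflt.c that this file parallelizes two lanes at a time (the z/tmp names match the register comments in the code below):

    /* One in-place 1-D pass over 8 floats; the 2-D transform applies
     * this to the 8 rows (Pass 1) and then the 8 columns (Pass 2). */
    static void fdct_float_1d(float *d)
    {
      float tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
      float tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
      float tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
      float tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];

      /* Even part */
      float tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
      float tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
      d[0] = tmp10 + tmp11;
      d[4] = tmp10 - tmp11;
      float z1 = (tmp12 + tmp13) * 0.707106781f;   /* PD_0_707 */
      d[2] = tmp13 + z1;
      d[6] = tmp13 - z1;

      /* Odd part */
      tmp10 = tmp4 + tmp5; tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7;
      float z5 = (tmp10 - tmp12) * 0.382683433f;   /* PD_0_382 */
      float z2 = 0.541196100f * tmp10 + z5;        /* PD_0_541 */
      float z4 = 1.306562965f * tmp12 + z5;        /* PD_1_306 */
      float z3 = tmp11 * 0.707106781f;             /* PD_0_707 */
      float z11 = tmp7 + z3, z13 = tmp7 - z3;
      d[5] = z13 + z2;
      d[3] = z13 - z2;
      d[1] = z11 + z4;
      d[7] = z11 - z4;
    }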
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
-;
-
-%define data(b) (b)+8 ; FAST_FLOAT *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_3dnow)
-
-EXTN(jsimd_fdct_float_3dnow):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
- movq mm4,mm0 ; transpose coefficients
- punpckldq mm0,mm1 ; mm0=(00 10)=data0
- punpckhdq mm4,mm1 ; mm4=(01 11)=data1
- movq mm5,mm2 ; transpose coefficients
- punpckldq mm2,mm3 ; mm2=(06 16)=data6
- punpckhdq mm5,mm3 ; mm5=(07 17)=data7
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm2 ; mm4=data1-data6=tmp6
- pfsub mm0,mm5 ; mm0=data0-data7=tmp7
- pfadd mm6,mm2 ; mm6=data1+data6=tmp1
- pfadd mm7,mm5 ; mm7=data0+data7=tmp0
-
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
-
- movq mm4,mm1 ; transpose coefficients
- punpckldq mm1,mm3 ; mm1=(02 12)=data2
- punpckhdq mm4,mm3 ; mm4=(03 13)=data3
- movq mm0,mm2 ; transpose coefficients
- punpckldq mm2,mm5 ; mm2=(04 14)=data4
- punpckhdq mm0,mm5 ; mm0=(05 15)=data5
-
- movq mm3,mm4
- movq mm5,mm1
- pfadd mm4,mm2 ; mm4=data3+data4=tmp3
- pfadd mm1,mm0 ; mm1=data2+data5=tmp2
- pfsub mm3,mm2 ; mm3=data3-data4=tmp4
- pfsub mm5,mm0 ; mm5=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm2,mm7
- movq mm0,mm6
- pfsub mm7,mm4 ; mm7=tmp13
- pfsub mm6,mm1 ; mm6=tmp12
- pfadd mm2,mm4 ; mm2=tmp10
- pfadd mm0,mm1 ; mm0=tmp11
-
- pfadd mm6,mm7
- pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
- movq mm4,mm2
- movq mm1,mm7
- pfsub mm2,mm0 ; mm2=data4
- pfsub mm7,mm6 ; mm7=data6
- pfadd mm4,mm0 ; mm4=data0
- pfadd mm1,mm6 ; mm1=data2
-
- movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
- ; -- Odd part
-
- movq mm0, MMWORD [wk(0)] ; mm0=tmp6
- movq mm6, MMWORD [wk(1)] ; mm6=tmp7
-
- pfadd mm3,mm5 ; mm3=tmp10
- pfadd mm5,mm0 ; mm5=tmp11
- pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
-
- pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
- movq mm2,mm3 ; mm2=tmp10
- pfsub mm3,mm0
- pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
- pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
- pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
- pfadd mm2,mm3 ; mm2=z2
- pfadd mm0,mm3 ; mm0=z4
-
- movq mm7,mm6
- pfsub mm6,mm5 ; mm6=z13
- pfadd mm7,mm5 ; mm7=z11
-
- movq mm4,mm6
- movq mm1,mm7
- pfsub mm6,mm2 ; mm6=data3
- pfsub mm7,mm0 ; mm7=data7
- pfadd mm4,mm2 ; mm4=data5
- pfadd mm1,mm0 ; mm1=data1
-
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
- movq mm4,mm0 ; transpose coefficients
- punpckldq mm0,mm1 ; mm0=(00 01)=data0
- punpckhdq mm4,mm1 ; mm4=(10 11)=data1
- movq mm5,mm2 ; transpose coefficients
- punpckldq mm2,mm3 ; mm2=(60 61)=data6
- punpckhdq mm5,mm3 ; mm5=(70 71)=data7
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm2 ; mm4=data1-data6=tmp6
- pfsub mm0,mm5 ; mm0=data0-data7=tmp7
- pfadd mm6,mm2 ; mm6=data1+data6=tmp1
- pfadd mm7,mm5 ; mm7=data0+data7=tmp0
-
- movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
-
- movq mm4,mm1 ; transpose coefficients
- punpckldq mm1,mm3 ; mm1=(20 21)=data2
- punpckhdq mm4,mm3 ; mm4=(30 31)=data3
- movq mm0,mm2 ; transpose coefficients
- punpckldq mm2,mm5 ; mm2=(40 41)=data4
- punpckhdq mm0,mm5 ; mm0=(50 51)=data5
-
- movq mm3,mm4
- movq mm5,mm1
- pfadd mm4,mm2 ; mm4=data3+data4=tmp3
- pfadd mm1,mm0 ; mm1=data2+data5=tmp2
- pfsub mm3,mm2 ; mm3=data3-data4=tmp4
- pfsub mm5,mm0 ; mm5=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm2,mm7
- movq mm0,mm6
- pfsub mm7,mm4 ; mm7=tmp13
- pfsub mm6,mm1 ; mm6=tmp12
- pfadd mm2,mm4 ; mm2=tmp10
- pfadd mm0,mm1 ; mm0=tmp11
-
- pfadd mm6,mm7
- pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
- movq mm4,mm2
- movq mm1,mm7
- pfsub mm2,mm0 ; mm2=data4
- pfsub mm7,mm6 ; mm7=data6
- pfadd mm4,mm0 ; mm4=data0
- pfadd mm1,mm6 ; mm1=data2
-
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- ; -- Odd part
-
- movq mm0, MMWORD [wk(0)] ; mm0=tmp6
- movq mm6, MMWORD [wk(1)] ; mm6=tmp7
-
- pfadd mm3,mm5 ; mm3=tmp10
- pfadd mm5,mm0 ; mm5=tmp11
- pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
-
- pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
- movq mm2,mm3 ; mm2=tmp10
- pfsub mm3,mm0
- pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
- pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
- pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
- pfadd mm2,mm3 ; mm2=z2
- pfadd mm0,mm3 ; mm0=z4
-
- movq mm7,mm6
- pfsub mm6,mm5 ; mm6=z13
- pfadd mm7,mm5 ; mm7=z11
-
- movq mm4,mm6
- movq mm1,mm7
- pfsub mm6,mm2 ; mm6=data3
- pfsub mm7,mm0 ; mm7=data7
- pfadd mm4,mm2 ; mm4=data5
- pfadd mm1,mm0 ; mm1=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- add edx, byte 2*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .columnloop
-
- femms ; empty MMX/3DNow! state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-sse-64.asm b/media/libjpeg/simd/jfdctflt-sse-64.asm
deleted file mode 100644
index 4b64ea4bb5..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse-64.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
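The two macros above emulate the missing "unpack low/high 64-bit halves" operations for packed floats via shufps immediates. An equivalent intrinsics sketch (the immediates select a0,a1,b0,b1 and a2,a3,b2,b3 respectively):

    #include <xmmintrin.h>   /* SSE */

    static inline __m128 unpcklps2(__m128 a, __m128 b)
    {
      return _mm_shuffle_ps(a, b, 0x44);   /* (a0 a1 b0 b1) */
    }

    static inline __m128 unpckhps2(__m128 a, __m128 b)
    {
      return _mm_shuffle_ps(a, b, 0xEE);   /* (a2 a3 b2 b3) */
    }

Phase 2 of the transpose below uses them to move 64-bit row halves into place after the 32-bit unpcklps/unpckhps interleaves of phase 1.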
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-; r10 = FAST_FLOAT *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
-.rowloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
-.columnloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, byte 4*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .columnloop
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-sse.asm b/media/libjpeg/simd/jfdctflt-sse.asm
deleted file mode 100644
index e7ede26c0c..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse.asm
+++ /dev/null
@@ -1,369 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-%define data(b) (b)+8 ; FAST_FLOAT *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, byte 4*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .columnloop
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctfst-mmx.asm b/media/libjpeg/simd/jfdctfst-mmx.asm
deleted file mode 100644
index eb2eb9c50d..0000000000
--- a/media/libjpeg/simd/jfdctfst-mmx.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
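In other words, operands are pre-shifted left by 2, constants are stored shifted left by CONST_SHIFT = 6, and pmulhw keeps the high 16 bits of the 32-bit product, discarding exactly 2 + 6 + 8 = 16 fraction bits. A scalar sketch of one such multiply (illustrative helper; same overflow constraint as the assembly, i.e. |x| must stay below 2^13):

    #include <stdint.h>

    /* mul_fix(x, 181) ~= x * 0.707106781; e.g. mul_fix(1000, 181) == 707. */
    static int16_t mul_fix(int16_t x, int16_t f /* FIX(c), e.g. F_0_707 = 181 */)
    {
      int16_t a = (int16_t)(x << 2);             /* psllw PRE_MULTIPLY_SCALE_BITS */
      int16_t b = (int16_t)(f << 6);             /* constant << CONST_SHIFT */
      return (int16_t)(((int32_t)a * b) >> 16);  /* pmulhw: high word of product */
    }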
- alignz 16
- global EXTN(jconst_fdct_ifast_mmx)
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_mmx)
-
-EXTN(jsimd_fdct_ifast_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(20 21 22 23), mm2=(24 25 26 27)
- ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
- punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 01 02 03), mm1=(04 05 06 07)
- ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
- punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
- punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
- punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
- punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
- punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- psubw mm5,mm7 ; mm5=tmp13
- psubw mm0,mm4 ; mm0=tmp12
- paddw mm1,mm7 ; mm1=tmp10
- paddw mm6,mm4 ; mm6=tmp11
-
- paddw mm0,mm5
- psllw mm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
- movq mm7,mm1
- movq mm4,mm5
- psubw mm1,mm6 ; mm1=data4
- psubw mm5,mm0 ; mm5=data6
- paddw mm7,mm6 ; mm7=data0
- paddw mm4,mm0 ; mm4=data2
-
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
- ; -- Odd part
-
- movq mm6, MMWORD [wk(0)] ; mm6=tmp6
- movq mm0, MMWORD [wk(1)] ; mm0=tmp7
-
- paddw mm2,mm3 ; mm2=tmp10
- paddw mm3,mm6 ; mm3=tmp11
- paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
-
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm6,PRE_MULTIPLY_SCALE_BITS
-
- psllw mm3,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
- movq mm1,mm2 ; mm1=tmp10
- psubw mm2,mm6
- pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
- pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
- pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
- paddw mm1,mm2 ; mm1=z2
- paddw mm6,mm2 ; mm6=z4
-
- movq mm5,mm0
- psubw mm0,mm3 ; mm0=z13
- paddw mm5,mm3 ; mm5=z11
-
- movq mm7,mm0
- movq mm4,mm5
- psubw mm0,mm1 ; mm0=data3
- psubw mm5,mm6 ; mm5=data7
- paddw mm7,mm1 ; mm7=data5
- paddw mm4,mm6 ; mm4=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
- add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(02 12 22 32), mm2=(42 52 62 72)
- ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
- punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
- punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 10 20 30), mm1=(40 50 60 70)
- ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
- punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
- punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
- punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
- punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
- punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- psubw mm5,mm7 ; mm5=tmp13
- psubw mm0,mm4 ; mm0=tmp12
- paddw mm1,mm7 ; mm1=tmp10
- paddw mm6,mm4 ; mm6=tmp11
-
- paddw mm0,mm5
- psllw mm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
- movq mm7,mm1
- movq mm4,mm5
- psubw mm1,mm6 ; mm1=data4
- psubw mm5,mm0 ; mm5=data6
- paddw mm7,mm6 ; mm7=data0
- paddw mm4,mm0 ; mm4=data2
-
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
- ; -- Odd part
-
- movq mm6, MMWORD [wk(0)] ; mm6=tmp6
- movq mm0, MMWORD [wk(1)] ; mm0=tmp7
-
- paddw mm2,mm3 ; mm2=tmp10
- paddw mm3,mm6 ; mm3=tmp11
- paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
-
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm6,PRE_MULTIPLY_SCALE_BITS
-
- psllw mm3,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
- movq mm1,mm2 ; mm1=tmp10
- psubw mm2,mm6
- pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
- pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
- pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
- paddw mm1,mm2 ; mm1=z2
- paddw mm6,mm2 ; mm6=z4
-
- movq mm5,mm0
- psubw mm0,mm3 ; mm0=z13
- paddw mm5,mm3 ; mm5=z11
-
- movq mm7,mm0
- movq mm4,mm5
- psubw mm0,mm1 ; mm0=data3
- psubw mm5,mm6 ; mm5=data7
- paddw mm7,mm1 ; mm7=data5
- paddw mm4,mm6 ; mm4=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
- add edx, byte 4*SIZEOF_DCTELEM
- dec ecx
- jnz near .columnloop
-
- emms ; empty MMX state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctfst-sse2-64.asm b/media/libjpeg/simd/jfdctfst-sse2-64.asm
deleted file mode 100644
index 4c96685427..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2-64.asm
+++ /dev/null
@@ -1,391 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
- ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
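
The ifast SSE2 code above leans on the pmulhw contract spelled out in its SEG_CONST comments: pmulhw returns only the high 16 bits of a signed 16x16-bit product, so the data are pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the constants by CONST_SHIFT, making the three shift amounts sum to 16. A minimal scalar model in C (a sketch for illustration only; the pmulhw() helper and the sample value are ours, not libjpeg-turbo's):

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 8
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

/* pmulhw keeps the high 16 bits of a signed 16x16-bit product. */
static int16_t pmulhw(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}

int main(void)
{
  int16_t x = 1000;                /* a DCT temporary, e.g. tmp12+tmp13 */
  int16_t c = 181 << CONST_SHIFT;  /* F_0_707 = FIX(0.707106781), CONST_BITS=8 */
  int16_t z1 = pmulhw((int16_t)(x << PRE_MULTIPLY_SCALE_BITS), c);
  /* Net effect: (x * 181) >> 8, i.e. x * 0.707... truncated; prints 707. */
  printf("%d\n", z1);
  return 0;
}
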
diff --git a/media/libjpeg/simd/jfdctfst-sse2.asm b/media/libjpeg/simd/jfdctfst-sse2.asm
deleted file mode 100644
index 54856a2363..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2.asm
+++ /dev/null
@@ -1,403 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
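
This 32-bit file is the same ifast transform as the 64-bit one above; the differences are confined to the ABI (the data pointer is fetched from the stack via data(eax) instead of arriving in r10) and to PIC constant addressing (get_GOT/GOTOFF(ebx,...) instead of rel). Both files also share the %else branch that rebuilds the constant table from 30-bit fixed-point values with DESCALE, a round-to-nearest right shift. A small C check of that macro (a sketch; the main() harness is ours):

#include <stdio.h>

/* DESCALE(x,n): shift right n bits, rounding to nearest. */
#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))

int main(void)
{
  /* 759250124 ~= 0.707106781 * 2^30; reduce to CONST_BITS = 8 precision. */
  long f_0_707 = DESCALE(759250124L, 30 - 8);
  printf("%ld\n", f_0_707);  /* prints 181, matching the CONST_BITS==8 table */
  return 0;
}
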
diff --git a/media/libjpeg/simd/jfdctint-altivec.c b/media/libjpeg/simd/jfdctint-altivec.c
deleted file mode 100644
index e6e8a5687e..0000000000
--- a/media/libjpeg/simd/jfdctint-altivec.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-
-#define DO_FDCT_COMMON(PASS) \
-{ \
- /* (Original) \
- * z1 = (tmp12 + tmp13) * 0.541196100; \
- * data2 = z1 + tmp13 * 0.765366865; \
- * data6 = z1 + tmp12 * -1.847759065; \
- * \
- * (This implementation) \
- * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
- * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
- */ \
- \
- tmp1312l = vec_mergeh(tmp13, tmp12); \
- tmp1312h = vec_mergel(tmp13, tmp12); \
- \
- out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
- out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
- out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
- out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
- \
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out6 = vec_pack(out6l, out6h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(tmp4, tmp6); \
- z4 = vec_add(tmp5, tmp7); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
- \
- /* (Original) \
- * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
- * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
- * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
- * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
- * \
- * (This implementation) \
- * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
- * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
- * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
- * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
- * data7 = tmp4 + z3; data5 = tmp5 + z4; \
- * data3 = tmp6 + z3; data1 = tmp7 + z4; \
- */ \
- \
- tmp47l = vec_mergeh(tmp4, tmp7); \
- tmp47h = vec_mergel(tmp4, tmp7); \
- \
- out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
- out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
- out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
- out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
- \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- \
- out7 = vec_pack(out7l, out7h); \
- out1 = vec_pack(out1l, out1h); \
- \
- tmp56l = vec_mergeh(tmp5, tmp6); \
- tmp56h = vec_mergel(tmp5, tmp6); \
- \
- out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
- out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
- out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
- out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
- \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- \
- out5 = vec_pack(out5l, out5h); \
- out3 = vec_pack(out3l, out3h); \
-}
-
-#define DO_FDCT_PASS1() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_sl(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_sl(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(1); \
-}
-
-#define DO_FDCT_PASS2() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_add(out0, pw_descale_p2x); \
- out0 = vec_sra(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_add(out4, pw_descale_p2x); \
- out4 = vec_sra(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(2); \
-}
-
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
- z3, z4, z34l, z34h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int z3l, z3h, z4l, z4h,
- out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
- out7l, out7h;
-
- /* Constants */
- __vector short
- pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
- pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
- pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
- pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
- pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
- pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
- pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
- pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
- pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
- __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
- __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
- pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
- __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
- descale_p2 = { __4X(DESCALE_P2) };
-
- /* Pass 1: process rows */
-
- row0 = vec_ld(0, data);
- row1 = vec_ld(16, data);
- row2 = vec_ld(32, data);
- row3 = vec_ld(48, data);
- row4 = vec_ld(64, data);
- row5 = vec_ld(80, data);
- row6 = vec_ld(96, data);
- row7 = vec_ld(112, data);
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT_PASS1();
-
- /* Pass 2: process columns */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT_PASS2();
-
- vec_st(out0, 0, data);
- vec_st(out1, 16, data);
- vec_st(out2, 32, data);
- vec_st(out3, 48, data);
- vec_st(out4, 64, data);
- vec_st(out5, 80, data);
- vec_st(out6, 96, data);
- vec_st(out7, 112, data);
-}
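
The (Original)/(This implementation) comments in DO_FDCT_COMMON above record an algebraic refactoring: the shared z1 term is distributed into the two outputs so that each output becomes a single two-term dot product over an interleaved (tmp13, tmp12) pair, which is exactly the shape vec_msums evaluates in one instruction. A scalar C check that the two forms agree (a sketch; the sample inputs are arbitrary):

#include <stdio.h>

#define F_0_541 4433   /* FIX(0.541196100) at CONST_BITS=13 */
#define F_0_765 6270   /* FIX(0.765366865) */
#define F_1_847 15137  /* FIX(1.847759065) */

int main(void)
{
  long tmp12 = 37, tmp13 = -105;  /* arbitrary even-part temporaries */

  /* Original formulation: one shared multiply, two corrections. */
  long z1 = (tmp12 + tmp13) * F_0_541;
  long data2_orig = z1 + tmp13 * F_0_765;
  long data6_orig = z1 - tmp12 * F_1_847;

  /* Refactored formulation: one dot product per output. */
  long data2 = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541;
  long data6 = tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847);

  printf("%d %d\n", data2 == data2_orig, data6 == data6_orig);  /* 1 1 */
  return 0;
}
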
diff --git a/media/libjpeg/simd/jfdctint-mmx.asm b/media/libjpeg/simd/jfdctint-mmx.asm
deleted file mode 100644
index 9142ad8816..0000000000
--- a/media/libjpeg/simd/jfdctint-mmx.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_mmx)
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_islow_mmx)
-
-EXTN(jsimd_fdct_islow_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(20 21 22 23), mm2=(24 25 26 27)
- ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
- punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 01 02 03), mm1=(04 05 06 07)
- ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
- punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
- punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
- punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
- punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
- punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- paddw mm5,mm7 ; mm5=tmp10
- paddw mm0,mm4 ; mm0=tmp11
- psubw mm1,mm7 ; mm1=tmp13
- psubw mm6,mm4 ; mm6=tmp12
-
- movq mm7,mm5
- paddw mm5,mm0 ; mm5=tmp10+tmp11
- psubw mm7,mm0 ; mm7=tmp10-tmp11
-
- psllw mm5,PASS1_BITS ; mm5=data0
- psllw mm7,PASS1_BITS ; mm7=data4
-
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movq mm4,mm1 ; mm1=tmp13
- movq mm0,mm1
- punpcklwd mm4,mm6 ; mm6=tmp12
- punpckhwd mm0,mm6
- movq mm1,mm4
- movq mm6,mm0
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
- pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
- pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm4,DESCALE_P1
- psrad mm0,DESCALE_P1
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm1,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm4,mm0 ; mm4=data2
- packssdw mm1,mm6 ; mm1=data6
-
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
- ; -- Odd part
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp6
- movq mm7, MMWORD [wk(1)] ; mm7=tmp7
-
- movq mm0,mm2 ; mm2=tmp4
- movq mm6,mm3 ; mm3=tmp5
- paddw mm0,mm5 ; mm0=z3
- paddw mm6,mm7 ; mm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm4,mm0
- movq mm1,mm0
- punpcklwd mm4,mm6
- punpckhwd mm1,mm6
- movq mm0,mm4
- movq mm6,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
- pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
- pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
- movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movq mm4,mm2
- movq mm1,mm2
- punpcklwd mm4,mm7
- punpckhwd mm1,mm7
- movq mm2,mm4
- movq mm7,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
- pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
-
- paddd mm4, MMWORD [wk(0)] ; mm4=data7L
- paddd mm1, MMWORD [wk(1)] ; mm1=data7H
- paddd mm2,mm0 ; mm2=data1L
- paddd mm7,mm6 ; mm7=data1H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm4,DESCALE_P1
- psrad mm1,DESCALE_P1
- paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm2,DESCALE_P1
- psrad mm7,DESCALE_P1
-
- packssdw mm4,mm1 ; mm4=data7
- packssdw mm2,mm7 ; mm2=data1
-
- movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
- movq mm1,mm3
- movq mm7,mm3
- punpcklwd mm1,mm5
- punpckhwd mm7,mm5
- movq mm3,mm1
- movq mm5,mm7
- pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
- pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
-
- paddd mm1,mm0 ; mm1=data5L
- paddd mm7,mm6 ; mm7=data5H
- paddd mm3, MMWORD [wk(0)] ; mm3=data3L
- paddd mm5, MMWORD [wk(1)] ; mm5=data3H
-
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm1,DESCALE_P1
- psrad mm7,DESCALE_P1
- paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm3,DESCALE_P1
- psrad mm5,DESCALE_P1
-
- packssdw mm1,mm7 ; mm1=data5
- packssdw mm3,mm5 ; mm3=data3
-
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
- add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(02 12 22 32), mm2=(42 52 62 72)
- ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
- punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
- punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 10 20 30), mm1=(40 50 60 70)
- ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
- punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
- punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
- punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
- punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
- punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- paddw mm5,mm7 ; mm5=tmp10
- paddw mm0,mm4 ; mm0=tmp11
- psubw mm1,mm7 ; mm1=tmp13
- psubw mm6,mm4 ; mm6=tmp12
-
- movq mm7,mm5
- paddw mm5,mm0 ; mm5=tmp10+tmp11
- psubw mm7,mm0 ; mm7=tmp10-tmp11
-
- paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
- paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
- psraw mm5,PASS1_BITS ; mm5=data0
- psraw mm7,PASS1_BITS ; mm7=data4
-
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movq mm4,mm1 ; mm1=tmp13
- movq mm0,mm1
- punpcklwd mm4,mm6 ; mm6=tmp12
- punpckhwd mm0,mm6
- movq mm1,mm4
- movq mm6,mm0
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
- pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
- pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm4,DESCALE_P2
- psrad mm0,DESCALE_P2
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm1,DESCALE_P2
- psrad mm6,DESCALE_P2
-
- packssdw mm4,mm0 ; mm4=data2
- packssdw mm1,mm6 ; mm1=data6
-
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
- ; -- Odd part
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp6
- movq mm7, MMWORD [wk(1)] ; mm7=tmp7
-
- movq mm0,mm2 ; mm2=tmp4
- movq mm6,mm3 ; mm3=tmp5
- paddw mm0,mm5 ; mm0=z3
- paddw mm6,mm7 ; mm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm4,mm0
- movq mm1,mm0
- punpcklwd mm4,mm6
- punpckhwd mm1,mm6
- movq mm0,mm4
- movq mm6,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
- pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
- pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
- movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movq mm4,mm2
- movq mm1,mm2
- punpcklwd mm4,mm7
- punpckhwd mm1,mm7
- movq mm2,mm4
- movq mm7,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
- pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
-
- paddd mm4, MMWORD [wk(0)] ; mm4=data7L
- paddd mm1, MMWORD [wk(1)] ; mm1=data7H
- paddd mm2,mm0 ; mm2=data1L
- paddd mm7,mm6 ; mm7=data1H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm4,DESCALE_P2
- psrad mm1,DESCALE_P2
- paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm2,DESCALE_P2
- psrad mm7,DESCALE_P2
-
- packssdw mm4,mm1 ; mm4=data7
- packssdw mm2,mm7 ; mm2=data1
-
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
- movq mm1,mm3
- movq mm7,mm3
- punpcklwd mm1,mm5
- punpckhwd mm7,mm5
- movq mm3,mm1
- movq mm5,mm7
- pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
- pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
-
- paddd mm1,mm0 ; mm1=data5L
- paddd mm7,mm6 ; mm7=data5H
- paddd mm3, MMWORD [wk(0)] ; mm3=data3L
- paddd mm5, MMWORD [wk(1)] ; mm5=data3H
-
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm1,DESCALE_P2
- psrad mm7,DESCALE_P2
- paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm3,DESCALE_P2
- psrad mm5,DESCALE_P2
-
- packssdw mm1,mm7 ; mm1=data5
- packssdw mm3,mm5 ; mm3=data3
-
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
- add edx, byte 4*SIZEOF_DCTELEM
- dec ecx
- jnz near .columnloop
-
- emms ; empty MMX state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
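
Unlike the SSE2 files, the MMX version above walks the block in 4-sample strips (DCTSIZE/4 loop iterations per pass), since a 64-bit MMX register holds only four DCTELEMs. Its multiply-accumulate step is pmaddwd over word pairs interleaved by punpcklwd/punpckhwd, followed by a rounding add of PD_DESCALE_P1 and an arithmetic right shift. A scalar model of that sequence (a sketch under the file's CONST_BITS=13 / PASS1_BITS=2 settings; pmaddwd_pair() is an illustrative helper, not a library function):

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)

#define F_0_541 4433
#define F_0_765 6270

/* pmaddwd: multiply adjacent signed 16-bit pairs, add into 32 bits. */
static int32_t pmaddwd_pair(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  int16_t tmp13 = -105, tmp12 = 37;  /* one interleaved (tmp13, tmp12) pair */
  /* PW_F130_F054 stores (F_0_541 + F_0_765, F_0_541) in each word pair. */
  int32_t data2 = pmaddwd_pair(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
  data2 += 1 << (DESCALE_P1 - 1);    /* paddd PD_DESCALE_P1: rounding bias */
  data2 >>= DESCALE_P1;              /* psrad: arithmetic shift, like >> here */
  printf("%d\n", data2);             /* prints -469 */
  return 0;
}
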
diff --git a/media/libjpeg/simd/jfdctint-sse2-64.asm b/media/libjpeg/simd/jfdctint-sse2-64.asm
deleted file mode 100644
index 9a0ca0fd28..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2-64.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
-
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H
- pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L
- pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm6,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm0,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L
- pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H
- pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm4,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H
- pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[rel PD_DESCALE_P1]
- paddd xmm3,[rel PD_DESCALE_P1]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[rel PW_DESCALE_P2X]
- paddw xmm5,[rel PW_DESCALE_P2X]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L
- pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L
- pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm2,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm6,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L
- pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H
- pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L
- pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H
- pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L
- pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm1,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L
- pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L
- pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[rel PD_DESCALE_P2]
- paddd xmm7,[rel PD_DESCALE_P2]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
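Note on the listing above: the "(This implementation)" comment blocks document a constant folding. Instead of computing z1 = (tmp12 + tmp13) * 0.541196100 and then applying two corrections, each output is rewritten as a two-term dot product, so one pmaddwd against a packed constant pair such as PW_F130_F054 = (F_0_541+F_0_765, F_0_541), applied to the (tmp13, tmp12) word pairs built by punpcklwd/punpckhwd, yields a whole 32-bit result lane. A minimal scalar C sketch of that folding (not part of the diff; constant names copied from the listing, final DESCALE shift omitted):

    #include <stdint.h>

    #define F_0_541  4433    /* FIX(0.541196100), CONST_BITS == 13 */
    #define F_0_765  6270    /* FIX(0.765366865) */
    #define F_1_847  15137   /* FIX(1.847759065) */

    /* PMADDWD per lane: multiply adjacent signed words, add the two
     * products into one signed dword. */
    static int32_t pmaddwd_lane(int16_t a0, int16_t a1,
                                int16_t b0, int16_t b1)
    {
        return (int32_t)a0 * b0 + (int32_t)a1 * b1;
    }

    /* Folded even-part rotation, as in the comments above:
     * data2 = tmp13*(0.541196100 + 0.765366865) + tmp12*0.541196100
     * data6 = tmp13*0.541196100 + tmp12*(0.541196100 - 1.847759065) */
    static void even_rotation(int16_t tmp12, int16_t tmp13,
                              int32_t *data2, int32_t *data6)
    {
        *data2 = pmaddwd_lane(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
        *data6 = pmaddwd_lane(tmp13, tmp12, F_0_541, F_0_541 - F_1_847);
    }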
diff --git a/media/libjpeg/simd/jfdctint-sse2.asm b/media/libjpeg/simd/jfdctint-sse2.asm
deleted file mode 100644
index db9d0bbe44..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2.asm
+++ /dev/null
@@ -1,633 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
-
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
- ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
- pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
- paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
- pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
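Note on the descale constants shared by both FDCT listings above: PD_DESCALE_P1, PD_DESCALE_P2, and PW_DESCALE_P2X each hold half of the final divisor, so the paddd/psrad (or paddw/psraw) pair implements round-to-nearest rather than the round-toward-negative-infinity of a bare arithmetic shift. A small C sketch (not part of the diff; values taken from the %define block above):

    #include <stdint.h>

    #define CONST_BITS  13
    #define PASS1_BITS  2
    #define DESCALE_P1  (CONST_BITS - PASS1_BITS)   /* pass-1 shift == 11 */

    /* paddd [PD_DESCALE_P1] ; psrad DESCALE_P1 */
    static int32_t descale_pass1(int32_t x)
    {
        return (x + (1 << (DESCALE_P1 - 1))) >> DESCALE_P1;
    }

    /* paddw [PW_DESCALE_P2X] ; psraw PASS1_BITS
     * (the pass-2 data0/data4 path, which never widens to dwords) */
    static int16_t descale_pass2(int16_t x)
    {
        return (int16_t)((x + (1 << (PASS1_BITS - 1))) >> PASS1_BITS);
    }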
diff --git a/media/libjpeg/simd/jidctflt-3dn.asm b/media/libjpeg/simd/jidctflt-3dn.asm
deleted file mode 100644
index 99356f20a4..0000000000
--- a/media/libjpeg/simd/jidctflt-3dn.asm
+++ /dev/null
@@ -1,451 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_3dnow)
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414 times 2 dd 1.414213562373095048801689
-PD_1_847 times 2 dd 1.847759065022573512256366
-PD_1_082 times 2 dd 1.082392200292393968799446
-PD_2_613 times 2 dd 2.613125929752753055713286
-PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_3dnow)
-
-EXTN(jsimd_idct_float_3dnow):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/2 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- pushpic ebx ; save GOT address
- mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
- or eax,ebx
- poppic ebx ; restore GOT address
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm0,mm0
- psrad mm0,(DWORD_BIT-WORD_BIT)
- pi2fd mm0,mm0
-
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm0,mm0
- punpcklwd mm1,mm1
- psrad mm0,(DWORD_BIT-WORD_BIT)
- psrad mm1,(DWORD_BIT-WORD_BIT)
- pi2fd mm0,mm0
- pi2fd mm1,mm1
-
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- punpcklwd mm2,mm2
- punpcklwd mm3,mm3
- psrad mm2,(DWORD_BIT-WORD_BIT)
- psrad mm3,(DWORD_BIT-WORD_BIT)
- pi2fd mm2,mm2
- pi2fd mm3,mm3
-
- pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm1
- pfsub mm0,mm2 ; mm0=tmp11
- pfsub mm1,mm3
- pfadd mm4,mm2 ; mm4=tmp10
- pfadd mm5,mm3 ; mm5=tmp13
-
- pfmul mm1,[GOTOFF(ebx,PD_1_414)]
- pfsub mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm5 ; mm4=tmp3
- pfsub mm0,mm1 ; mm0=tmp2
- pfadd mm6,mm5 ; mm6=tmp0
- pfadd mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; tmp3
- movq MMWORD [wk(0)], mm0 ; tmp2
-
- ; -- Odd part
-
- movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm2,mm2
- punpcklwd mm3,mm3
- psrad mm2,(DWORD_BIT-WORD_BIT)
- psrad mm3,(DWORD_BIT-WORD_BIT)
- pi2fd mm2,mm2
- pi2fd mm3,mm3
-
- pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- punpcklwd mm5,mm5
- punpcklwd mm1,mm1
- psrad mm5,(DWORD_BIT-WORD_BIT)
- psrad mm1,(DWORD_BIT-WORD_BIT)
- pi2fd mm5,mm5
- pi2fd mm1,mm1
-
- pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm4,mm2
- movq mm0,mm5
- pfadd mm2,mm1 ; mm2=z11
- pfadd mm5,mm3 ; mm5=z13
- pfsub mm4,mm1 ; mm4=z12
- pfsub mm0,mm3 ; mm0=z10
-
- movq mm1,mm2
- pfsub mm2,mm5
- pfadd mm1,mm5 ; mm1=tmp7
-
- pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
-
- movq mm3,mm0
- pfadd mm0,mm4
- pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
- pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
- pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
- pfsubr mm3,mm0 ; mm3=tmp12
- pfsub mm4,mm0 ; mm4=tmp10
-
- ; -- Final output stage
-
- pfsub mm3,mm1 ; mm3=tmp6
- movq mm5,mm6
- movq mm0,mm7
- pfadd mm6,mm1 ; mm6=data0=(00 01)
- pfadd mm7,mm3 ; mm7=data1=(10 11)
- pfsub mm5,mm1 ; mm5=data7=(70 71)
- pfsub mm0,mm3 ; mm0=data6=(60 61)
- pfsub mm2,mm3 ; mm2=tmp5
-
- movq mm1,mm6 ; transpose coefficients
- punpckldq mm6,mm7 ; mm6=(00 10)
- punpckhdq mm1,mm7 ; mm1=(01 11)
- movq mm3,mm0 ; transpose coefficients
- punpckldq mm0,mm5 ; mm0=(60 70)
- punpckhdq mm3,mm5 ; mm3=(61 71)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp2
- movq mm5, MMWORD [wk(1)] ; mm5=tmp3
-
- pfadd mm4,mm2 ; mm4=tmp4
- movq mm6,mm7
- movq mm1,mm5
- pfadd mm7,mm2 ; mm7=data2=(20 21)
- pfadd mm5,mm4 ; mm5=data4=(40 41)
- pfsub mm6,mm2 ; mm6=data5=(50 51)
- pfsub mm1,mm4 ; mm1=data3=(30 31)
-
- movq mm0,mm7 ; transpose coefficients
- punpckldq mm7,mm1 ; mm7=(20 30)
- punpckhdq mm0,mm1 ; mm0=(21 31)
- movq mm3,mm5 ; transpose coefficients
- punpckldq mm5,mm6 ; mm5=(40 50)
- punpckhdq mm3,mm6 ; mm3=(41 51)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
- add esi, byte 2*SIZEOF_JCOEF ; coef_block
- add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/2 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movq mm4,mm0
- movq mm5,mm1
- pfsub mm0,mm2 ; mm0=tmp11
- pfsub mm1,mm3
- pfadd mm4,mm2 ; mm4=tmp10
- pfadd mm5,mm3 ; mm5=tmp13
-
- pfmul mm1,[GOTOFF(ebx,PD_1_414)]
- pfsub mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm5 ; mm4=tmp3
- pfsub mm0,mm1 ; mm0=tmp2
- pfadd mm6,mm5 ; mm6=tmp0
- pfadd mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; tmp3
- movq MMWORD [wk(0)], mm0 ; tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movq mm4,mm2
- movq mm0,mm5
- pfadd mm2,mm1 ; mm2=z11
- pfadd mm5,mm3 ; mm5=z13
- pfsub mm4,mm1 ; mm4=z12
- pfsub mm0,mm3 ; mm0=z10
-
- movq mm1,mm2
- pfsub mm2,mm5
- pfadd mm1,mm5 ; mm1=tmp7
-
- pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
-
- movq mm3,mm0
- pfadd mm0,mm4
- pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
- pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
- pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
- pfsubr mm3,mm0 ; mm3=tmp12
- pfsub mm4,mm0 ; mm4=tmp10
-
- ; -- Final output stage
-
- pfsub mm3,mm1 ; mm3=tmp6
- movq mm5,mm6
- movq mm0,mm7
- pfadd mm6,mm1 ; mm6=data0=(00 10)
- pfadd mm7,mm3 ; mm7=data1=(01 11)
- pfsub mm5,mm1 ; mm5=data7=(07 17)
- pfsub mm0,mm3 ; mm0=data6=(06 16)
- pfsub mm2,mm3 ; mm2=tmp5
-
- movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
- pcmpeqd mm3,mm3
- psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
- pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
- pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
- pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
- pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
-
- pand mm6,mm3 ; mm6=(00 -- 10 --)
- pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11)
- pand mm0,mm3 ; mm0=(06 -- 16 --)
- pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17)
- por mm6,mm7 ; mm6=(00 01 10 11)
- por mm0,mm5 ; mm0=(06 07 16 17)
-
- movq mm1, MMWORD [wk(0)] ; mm1=tmp2
- movq mm3, MMWORD [wk(1)] ; mm3=tmp3
-
- pfadd mm4,mm2 ; mm4=tmp4
- movq mm7,mm1
- movq mm5,mm3
- pfadd mm1,mm2 ; mm1=data2=(02 12)
- pfadd mm3,mm4 ; mm3=data4=(04 14)
- pfsub mm7,mm2 ; mm7=data5=(05 15)
- pfsub mm5,mm4 ; mm5=data3=(03 13)
-
- movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
- pcmpeqd mm4,mm4
- psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
- pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
- pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
- pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
- pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
-
- pand mm3,mm4 ; mm3=(04 -- 14 --)
- pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15)
- pand mm1,mm4 ; mm1=(02 -- 12 --)
- pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13)
- por mm3,mm7 ; mm3=(04 05 14 15)
- por mm1,mm5 ; mm1=(02 03 12 13)
-
- movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
-
- packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15)
- packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17)
- paddb mm6,mm2
- paddb mm1,mm2
-
- movq mm4,mm6 ; transpose coefficients(phase 2)
- punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13)
- punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 3)
- punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07)
- punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
- poppic ebx ; restore GOT address
-
- add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 2*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
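Note on PD_RNDINT_MAGIC in the listing above: 100663296.0 = (float)(0x00C00000 << 3) = 3 * 2^25, and at that magnitude a float's ULP is 8. pfadd-ing the constant therefore rounds the sum to a multiple of 8 and leaves round(x/8) sitting in the low mantissa bits, which the subsequent pand/pslld/por sequence extracts as 16-bit samples; the trick absorbs the IDCT's final division by 8 for free, matching the "roundint(dataN/8)" comments. A standalone C demonstration (not part of the diff; assumes 32-bit IEEE floats and the default round-to-nearest mode):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Round x to the nearest integer and divide by 8 in a single float
     * add, mirroring the pfadd [PD_RNDINT_MAGIC] sequence above. */
    static int16_t round_div8(float x)
    {
        float y = x + 100663296.0f;   /* 3 * 2^25; ULP at this magnitude is 8 */
        uint32_t bits;
        memcpy(&bits, &y, sizeof(bits));
        return (int16_t)bits;         /* low 16 mantissa bits == round(x / 8) */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               round_div8(80.0f),      /* 10 */
               round_div8(-84.0f),     /* -10: -10.5 ties to even */
               round_div8(1000.0f));   /* 125 */
        return 0;
    }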
diff --git a/media/libjpeg/simd/jidctflt-sse.asm b/media/libjpeg/simd/jidctflt-sse.asm
deleted file mode 100644
index 4d4af2fffc..0000000000
--- a/media/libjpeg/simd/jidctflt-sse.asm
+++ /dev/null
@@ -1,571 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse)
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_0_125 times 4 dd 0.125 ; 1/8
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse)
-
-EXTN(jsimd_idct_float_sse):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm1,mm0 ; mm1=(** 02 ** 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
- cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **)
- cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
- movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm4,mm0 ; mm4=(** 02 ** 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm5,mm1 ; mm5=(** 22 ** 23)
- punpcklwd mm1,mm1 ; mm1=(20 20 21 21)
-
- psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
- cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **)
- cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
- psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
- cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **)
- cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **)
-
- punpckhwd mm6,mm2 ; mm6=(** 42 ** 43)
- punpcklwd mm2,mm2 ; mm2=(40 40 41 41)
- punpckhwd mm7,mm3 ; mm7=(** 62 ** 63)
- punpcklwd mm3,mm3 ; mm3=(60 60 61 61)
-
- psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
- psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
- cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **)
- cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **)
- psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
- psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
- cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **)
- cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **)
-
- movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03)
- movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23)
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43)
- movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63)
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm6,mm4 ; mm6=(** 12 ** 13)
- punpcklwd mm4,mm4 ; mm4=(10 10 11 11)
- punpckhwd mm2,mm0 ; mm2=(** 32 ** 33)
- punpcklwd mm0,mm0 ; mm0=(30 30 31 31)
-
- psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
- psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
- cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **)
- cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **)
- psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
- cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **)
- cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **)
-
- punpckhwd mm7,mm5 ; mm7=(** 52 ** 53)
- punpcklwd mm5,mm5 ; mm5=(50 50 51 51)
- punpckhwd mm3,mm1 ; mm3=(** 72 ** 73)
- punpcklwd mm1,mm1 ; mm1=(70 70 71 71)
-
- movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13)
- movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33)
-
- psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
- psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
- cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **)
- cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **)
- psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
- cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **)
- cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53)
- movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73)
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
-
- mulps xmm6,xmm1 ; descale(1/8)
- mulps xmm7,xmm1 ; descale(1/8)
- mulps xmm5,xmm1 ; descale(1/8)
- mulps xmm0,xmm1 ; descale(1/8)
-
- movhlps xmm3,xmm6
- movhlps xmm1,xmm7
- cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10)
- cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11)
- cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30)
- cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31)
- packssdw mm0,mm2 ; mm0=data0=(00 10 20 30)
- packssdw mm1,mm3 ; mm1=data1=(01 11 21 31)
-
- movhlps xmm6,xmm5
- movhlps xmm7,xmm0
- cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17)
- cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16)
- cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37)
- cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36)
- packssdw mm4,mm6 ; mm4=data7=(07 17 27 37)
- packssdw mm5,mm7 ; mm5=data6=(06 16 26 36)
-
- packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36)
- packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37)
-
- movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
- movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm5,xmm3
- movaps xmm0,xmm1
- addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
- addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34)
- subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35)
- subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33)
-
- mulps xmm3,xmm6 ; descale(1/8)
- mulps xmm1,xmm6 ; descale(1/8)
- mulps xmm5,xmm6 ; descale(1/8)
- mulps xmm0,xmm6 ; descale(1/8)
-
- movhlps xmm7,xmm3
- movhlps xmm2,xmm1
- cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12)
- cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14)
- cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32)
- cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34)
- packssdw mm2,mm6 ; mm2=data2=(02 12 22 32)
- packssdw mm3,mm7 ; mm3=data4=(04 14 24 34)
-
- movhlps xmm4,xmm5
- movhlps xmm6,xmm0
- cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15)
- cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13)
- cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35)
- cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33)
- packssdw mm5,mm6 ; mm5=data5=(05 15 25 35)
- packssdw mm4,mm7 ; mm4=data3=(03 13 23 33)
-
- movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
-
- packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34)
- packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35)
-
- paddb mm0,mm6
- paddb mm1,mm6
- paddb mm2,mm6
- paddb mm4,mm6
-
- movq mm7,mm0 ; transpose coefficients(phase 1)
- punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31)
- punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37)
- movq mm3,mm2 ; transpose coefficients(phase 1)
- punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33)
- punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35)
-
- movq mm5,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13)
- punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33)
- movq mm6,mm3 ; transpose coefficients(phase 2)
- punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17)
- punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37)
-
- movq mm1,mm0 ; transpose coefficients(phase 3)
- punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07)
- punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17)
- movq mm4,mm5 ; transpose coefficients(phase 3)
- punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27)
- punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
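Editorial note on the files deleted here: the three jidctflt-*.asm files all vectorize the same AAN floating-point butterfly from the scalar jidctflt.c that their headers cite, four columns or rows per iteration. As a reading aid, the following is a hedged scalar C sketch of one 8-point 1-D pass, a paraphrase rather than the library's source; the intermediate names are chosen to match the register comments above (tmp10..tmp13, z5, z10..z13).

    /* Scalar sketch (not the library's source) of one 8-point AAN float IDCT
     * pass, matching the register comments in the asm above.  in[] is one
     * dequantized column; out[] receives the transformed column. */
    static void idct_1d_float_sketch(const float in[8], float out[8])
    {
      /* -- Even part */
      float tmp10 = in[0] + in[4];
      float tmp11 = in[0] - in[4];
      float tmp13 = in[2] + in[6];
      float tmp12 = (in[2] - in[6]) * 1.414213562f - tmp13;  /* PD_1_414 */

      float tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13;
      float tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12;

      /* -- Odd part */
      float z11 = in[1] + in[7], z12 = in[1] - in[7];
      float z13 = in[5] + in[3], z10 = in[5] - in[3];

      float tmp7 = z11 + z13;
      float t11  = (z11 - z13) * 1.414213562f;               /* PD_1_414  */
      float z5   = (z10 + z12) * 1.847759065f;               /* PD_1_847  */
      float t12  = -2.613125930f * z10 + z5;                 /* PD_M2_613 */
      float t10  = 1.082392200f * z12 - z5;                  /* PD_1_082  */

      float tmp6 = t12 - tmp7;
      float tmp5 = t11 - tmp6;
      float tmp4 = t10 + tmp5;

      /* -- Final output stage */
      out[0] = tmp0 + tmp7;  out[7] = tmp0 - tmp7;
      out[1] = tmp1 + tmp6;  out[6] = tmp1 - tmp6;
      out[2] = tmp2 + tmp5;  out[5] = tmp2 - tmp5;
      out[4] = tmp3 + tmp4;  out[3] = tmp3 - tmp4;
    }

Dequantization happens before this butterfly (the mulps against the FLOAT_MULT_TYPE table), the transpose is interleaved between the two passes, and pass 2 additionally scales by 1/8 (PD_0_125 in the file above, PD_RNDINT_MAGIC in the SSE2 variants below) and re-centers samples with PB_CENTERJSAMP.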
diff --git a/media/libjpeg/simd/jidctflt-sse2-64.asm b/media/libjpeg/simd/jidctflt-sse2-64.asm
deleted file mode 100644
index bdda05d97c..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2-64.asm
+++ /dev/null
@@ -1,482 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see that file for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [workspace]
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input, store into work array.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
- lea rdi, [workspace] ; FAST_FLOAT *wsptr
- mov rcx, DCTSIZE/4 ; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add rsi, byte 4*SIZEOF_JCOEF ; coef_block
- add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec rcx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- lea rsi, [workspace] ; FAST_FLOAT *wsptr
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
- mov rcx, DCTSIZE/4 ; ctr
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
- add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add rdi, byte 4*SIZEOF_JSAMPROW
- dec rcx ; ctr
- jnz near .rowloop
-
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
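Editorial note: unlike the SSE/MMX file above, which converts floats back to integers with cvtps2pi, the two SSE2 variants in this patch round and descale in one step with PD_RNDINT_MAGIC. A hedged C sketch of the trick follows; the helper name is mine, not the library's.

    #include <stdint.h>
    #include <string.h>

    /* Adding 1.5 * 2^26 (= (float)(0x00C00000 << 3)) makes the float adder's
     * nearest-even rounding quantize x to a multiple of 8; the low 16 bits of
     * the sum's bit pattern are then round(x/8) in two's complement, which
     * the pand/pslld/por sequences above extract directly. */
    static int16_t round_div8(float x)     /* hypothetical helper name */
    {
      const float magic = 100663296.0f;    /* PD_RNDINT_MAGIC */
      float y = x + magic;
      uint32_t bits;
      memcpy(&bits, &y, sizeof(bits));     /* reinterpret the bits, no convert */
      return (int16_t)(bits & 0xFFFFu);    /* low word == round(x / 8) */
    }

For example, round_div8(17.0f) yields 2 and round_div8(-17.0f) yields -2; exact halfway cases round to even under the default MXCSR rounding mode, which the asm assumes.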
diff --git a/media/libjpeg/simd/jidctflt-sse2.asm b/media/libjpeg/simd/jidctflt-sse2.asm
deleted file mode 100644
index a15a9c1111..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2.asm
+++ /dev/null
@@ -1,497 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see that file for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
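Editorial note: all of the 32-bit routines in this patch share the same prologue: save the caller's frame, round esp down to a 16-byte boundary so the movaps traffic on wk()/workspace cannot fault, stash the original frame pointer at the new stack top, and carve the workspace below it. A C illustration of that alignment step, assuming nothing beyond the arithmetic itself:

    #include <stdint.h>

    /* Equivalent of "and esp, byte (-SIZEOF_XMMWORD)": round a raw address
     * down to a 16-byte boundary so 128-bit aligned accesses are safe.
     * Illustration only; the .asm applies this to esp itself and keeps the
     * original value at [esp] so the epilogue can restore it. */
    static inline void *align_down_16(void *p)
    {
      return (void *)((uintptr_t)p & ~(uintptr_t)15);
    }

The get_GOT/GOTOFF and pushpic/poppic pairs serve the other 32-bit constraint: position-independent builds cannot address the PD_*/PB_* constants absolutely, so ebx holds the GOT base and is spilled around the row stores that need it as a scratch register.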
diff --git a/media/libjpeg/simd/jidctfst-mmx.asm b/media/libjpeg/simd/jidctfst-mmx.asm
deleted file mode 100644
index 6e95bfbcaf..0000000000
--- a/media/libjpeg/simd/jidctfst-mmx.asm
+++ /dev/null
@@ -1,499 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast but less accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see that file
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_idct_ifast_mmx)
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
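Editorial note: the ifast constants above encode the AAN multipliers in CONST_BITS=8 fixed point (FIX(x) = round(x * 2^8), so F_1_414 == 362), pre-shifted by CONST_SHIFT so that a pmulhw against a sample pre-shifted by PRE_MULTIPLY_SCALE_BITS reduces to a multiply by the real-valued constant, since CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16. A hedged C sketch of one lane (names mine):

    #include <stdint.h>

    #define CONST_BITS              8
    #define PRE_MULTIPLY_SCALE_BITS 2
    #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)  /* == 6 */

    /* One lane of pmulhw: the high 16 bits of the signed 32-bit product. */
    static int16_t pmulhw1(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
    }

    /* Multiply a sample by an AAN constant c = F / 2^CONST_BITS.  The two
     * pre-shifts plus pmulhw's >>16 leave exactly x * F >> 8, assuming |x|
     * is small enough that x << 2 fits in 16 bits, which the
     * "PRE_MULTIPLY_SCALE_BITS <= 2" bound above is chosen to guarantee. */
    static int16_t mul_fix(int16_t x, int16_t F)
    {
      return pmulhw1((int16_t)(x << PRE_MULTIPLY_SCALE_BITS),
                     (int16_t)(F << CONST_SHIFT));
    }
    /* e.g. mul_fix(1000, 362) == 1414, i.e. roughly 1000 * 1.414213562 */

The same 16-bit budget explains F_1_613: FIX(2.613125930) << CONST_SHIFT would overflow a signed word, so the odd part multiplies by FIX(2.613125930) - FIX(1) and subtracts the unscaled z10 separately, exactly as the "To avoid overflow" comment later in this file derives.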
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_ifast_mmx)
-
-EXTN(jsimd_idct_ifast_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm1
- psubw mm0,mm2 ; mm0=tmp11
- psubw mm1,mm3
- paddw mm4,mm2 ; mm4=tmp10
- paddw mm5,mm3 ; mm5=tmp13
-
- psllw mm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
- psubw mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- psubw mm4,mm5 ; mm4=tmp3
- psubw mm0,mm1 ; mm0=tmp2
- paddw mm6,mm5 ; mm6=tmp0
- paddw mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
- movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm4,mm2
- movq mm0,mm5
- psubw mm2,mm1 ; mm2=z12
- psubw mm5,mm3 ; mm5=z10
- paddw mm4,mm1 ; mm4=z11
- paddw mm0,mm3 ; mm0=z13
-
- movq mm1,mm5 ; mm1=z10(unscaled)
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm5,PRE_MULTIPLY_SCALE_BITS
-
- movq mm3,mm4
- psubw mm4,mm0
- paddw mm3,mm0 ; mm3=tmp7
-
- psllw mm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movq mm0,mm5
- paddw mm5,mm2
- pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
- pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
- psubw mm0,mm1
- psubw mm2,mm5 ; mm2=tmp10
- paddw mm0,mm5 ; mm0=tmp12
-
- ; -- Final output stage
-
- psubw mm0,mm3 ; mm0=tmp6
- movq mm1,mm6
- movq mm5,mm7
- paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
- paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
- psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
- psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
- psubw mm4,mm0 ; mm4=tmp5
-
- movq mm3,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
- movq mm0,mm5 ; transpose coefficients(phase 1)
- punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
- punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp2
- movq mm1, MMWORD [wk(1)] ; mm1=tmp3
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
- movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
-
- paddw mm2,mm4 ; mm2=tmp4
- movq mm5,mm7
- movq mm0,mm1
- paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
- paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
- psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
- psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
-
- movq mm4,mm7 ; transpose coefficients(phase 1)
- punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
- punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
- punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
-
- movq mm0,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm7 ; mm6=(00 10 20 30)
- punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
- movq mm5,mm3 ; transpose coefficients(phase 2)
- punpckldq mm3,mm4 ; mm3=(02 12 22 32)
- punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
- movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm7 ; mm1=(40 50 60 70)
- punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
- movq mm0,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm4 ; mm2=(42 52 62 72)
- punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm0
- movq mm5,mm1
- psubw mm0,mm2 ; mm0=tmp11
- psubw mm1,mm3
- paddw mm4,mm2 ; mm4=tmp10
- paddw mm5,mm3 ; mm5=tmp13
-
- psllw mm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
- psubw mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- psubw mm4,mm5 ; mm4=tmp3
- psubw mm0,mm1 ; mm0=tmp2
- paddw mm6,mm5 ; mm6=tmp0
- paddw mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
- movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm2
- movq mm0,mm5
- psubw mm2,mm1 ; mm2=z12
- psubw mm5,mm3 ; mm5=z10
- paddw mm4,mm1 ; mm4=z11
- paddw mm0,mm3 ; mm0=z13
-
- movq mm1,mm5 ; mm1=z10(unscaled)
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm5,PRE_MULTIPLY_SCALE_BITS
-
- movq mm3,mm4
- psubw mm4,mm0
- paddw mm3,mm0 ; mm3=tmp7
-
- psllw mm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movq mm0,mm5
- paddw mm5,mm2
- pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
- pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
- psubw mm0,mm1
- psubw mm2,mm5 ; mm2=tmp10
- paddw mm0,mm5 ; mm0=tmp12
-
- ; -- Final output stage
-
- psubw mm0,mm3 ; mm0=tmp6
- movq mm1,mm6
- movq mm5,mm7
- paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
- paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
- psraw mm6,(PASS1_BITS+3) ; descale
- psraw mm7,(PASS1_BITS+3) ; descale
- psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
- psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
- psraw mm1,(PASS1_BITS+3) ; descale
- psraw mm5,(PASS1_BITS+3) ; descale
- psubw mm4,mm0 ; mm4=tmp5
-
- packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
- packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
-
- movq mm3, MMWORD [wk(0)] ; mm3=tmp2
- movq mm0, MMWORD [wk(1)] ; mm0=tmp3
-
- paddw mm2,mm4 ; mm2=tmp4
- movq mm5,mm3
- movq mm1,mm0
- paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
- paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
- psraw mm3,(PASS1_BITS+3) ; descale
- psraw mm0,(PASS1_BITS+3) ; descale
- psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
- psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
- psraw mm5,(PASS1_BITS+3) ; descale
- psraw mm1,(PASS1_BITS+3) ; descale
-
- movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
-
- packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
- packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
-
- paddb mm6,mm4
- paddb mm7,mm4
- paddb mm3,mm4
- paddb mm1,mm4
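The packsswb/paddb pair above is the sample range limit: packsswb saturates each descaled 16-bit result to [-128, 127], and adding PB_CENTERJSAMP bytewise (with wraparound) re-centers that to [0, 255]. A scalar sketch, assuming CENTERJSAMPLE == 128:

#include <stdint.h>

/* Scalar equivalent of packsswb followed by paddb CENTERJSAMPLE (sketch,
 * assuming 8-bit samples with CENTERJSAMPLE == 128). */
static uint8_t range_limit(int16_t x) {
  if (x < -128) x = -128;      /* packsswb saturation */
  if (x >  127) x =  127;
  return (uint8_t)(x + 128);   /* paddb: wraps mod 256, lands in 0..255 */
}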
-
- movq mm2,mm6 ; transpose coefficients(phase 1)
- punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
- punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
- movq mm0,mm3 ; transpose coefficients(phase 1)
- punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
- punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
-
- movq mm5,mm6 ; transpose coefficients(phase 2)
- punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
- punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
- movq mm4,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
- punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
-
- movq mm7,mm6 ; transpose coefficients(phase 3)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
- movq mm1,mm5 ; transpose coefficients(phase 3)
- punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
- punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
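The three punpck phases above rebuild pixel rows from the column-ordered words, four rows at a time in this MMX code. Conceptually the net effect is the byte-matrix transpose below (a reference sketch, not the SIMD code):

#include <stdint.h>

/* Reference for what the punpcklbw/punpcklwd/punpckldq cascade achieves:
 * element (r, c) of the packed input ends up at (c, r) of the output. */
static void transpose8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  for (int r = 0; r < 8; r++)
    for (int c = 0; c < 8; c++)
      out[c][r] = in[r][c];
}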
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_JCOEF ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
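The epilogue above undoes the prologue's stack trick: esp was rounded down to an MMWORD boundary and the caller's esp stored inside the aligned frame, so "pop esp" restores it exactly. The alignment step in portable terms (a sketch, not a translation of the asm):

#include <stdint.h>

/* Round a pointer down to a power-of-two boundary, as the prologue does
 * with "and esp, -SIZEOF_MMWORD" (SIZEOF_MMWORD == 8 assumed here). */
static void *align_down(void *p, uintptr_t a) {
  return (void *)((uintptr_t)p & ~(a - 1));
}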
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctfst-sse2-64.asm b/media/libjpeg/simd/jidctfst-sse2-64.asm
deleted file mode 100644
index 48846426d2..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2-64.asm
+++ /dev/null
@@ -1,491 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
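The two conditions above make pmulhw land on an integer product: with CONST_BITS = 8, PRE_MULTIPLY_SCALE_BITS = 2 and CONST_SHIFT = 6, the pre-shifted input and pre-shifted constant together carry exactly the 16 bits that pmulhw discards. A worked check in C (standalone sketch):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  int16_t x = 100;
  int16_t c = 362 << 6;    /* F_1_414 = FIX(1.414213562) << CONST_SHIFT */
  int16_t hi = (int16_t)(((int32_t)(x << 2) * c) >> 16);  /* psllw + pmulhw */
  printf("%d vs %.2f\n", hi, 1.414213562 * x);  /* 141 vs 141.42 */
  return 0;
}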
-
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
-%endif
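The block above is a shortcut for sparse data: when every AC row of the block is zero, the 1-D column IDCT degenerates to replicating each column's dequantized DC term, so the pshufd broadcasts replace the whole transform. A scalar sketch of pass 1 under that condition (hypothetical helper, not from this file):

#include <stdint.h>

/* Pass-1 shortcut sketch: with rows 1..7 all zero, column c's 1-D IDCT
 * is its dequantized row-0 coefficient replicated down the column. */
static void columns_dc_only(const int16_t coef[64], const int16_t quant[64],
                            int16_t cols[64]) {
  for (int c = 0; c < 8; c++) {
    int16_t dc = (int16_t)(coef[c] * quant[c]);
    for (int r = 0; r < 8; r++)
      cols[r * 8 + c] = dc;
  }
}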
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[rel PW_F1414]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
- pmulhw xmm0,[rel PW_MF1613]
- pmulhw xmm2,[rel PW_F1082]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
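The four prefetchnta lines hint the next 128-byte coefficient block (DCTSIZE2 JCOEFs) into cache non-temporally while pass 2 runs on the current one. An intrinsic-level analogue, assuming a GCC/Clang-style builtin:

#include <stdint.h>

/* Non-temporal prefetch of the next coefficient block, 32 bytes at a
 * time as above (sketch; __builtin_prefetch is GCC/Clang-specific). */
static void prefetch_next_block(const int16_t *coef_block) {
  const char *next = (const char *)(coef_block + 64);   /* DCTSIZE2 */
  for (int i = 0; i < 4; i++)
    __builtin_prefetch(next + i * 32, 0, 0);  /* read access, non-temporal */
}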
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F1414]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
- pmulhw xmm6,[rel PW_MF1613]
- pmulhw xmm0,[rel PW_F1082]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctfst-sse2.asm b/media/libjpeg/simd/jidctfst-sse2.asm
deleted file mode 100644
index f591e55f0f..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
- pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
- pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-mmx.asm b/media/libjpeg/simd/jidctint-mmx.asm
deleted file mode 100644
index 5bd198120b..0000000000
--- a/media/libjpeg/simd/jidctint-mmx.asm
+++ /dev/null
@@ -1,851 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
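DESCALE(x,n) is a round-to-nearest right shift: add half of 2^n, then shift. It reproduces the CONST_BITS == 13 table entries above from the 30-bit values; a standalone check:

#include <stdio.h>

#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))

int main(void) {
  printf("%ld\n", DESCALE(1984016188L, 30 - 13));  /* 15137 == F_1_847 */
  return 0;
}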
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_mmx)
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 12
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
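This kernel keeps a JCOEF workspace on the aligned stack and runs the 2-D IDCT as two 1-D passes: columns into the workspace, then workspace rows into samples. A skeleton of the control flow (idct_col/idct_row are placeholder names, not functions in this file):

#include <stdint.h>

/* Two-pass 8x8 IDCT skeleton; pass bodies elided, structure as in the asm. */
static void idct_2d(const int16_t coef[64], const int16_t quant[64],
                    uint8_t *output_buf[8], unsigned output_col) {
  int16_t workspace[64];
  for (int c = 0; c < 8; c += 4) {
    /* Pass 1: dequantize + 1-D IDCT of 4 columns -> workspace */
  }
  for (int r = 0; r < 8; r += 4) {
    /* Pass 2: 1-D IDCT of 4 workspace rows -> output_buf[r..r+3] + output_col */
  }
  (void)coef; (void)quant; (void)output_buf; (void)output_col; (void)workspace;
}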
-
- align 16
- global EXTN(jsimd_idct_islow_mmx)
-
-EXTN(jsimd_idct_islow_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw mm0,PASS1_BITS
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movq mm4,mm1 ; mm1=in2=z2
- movq mm5,mm1
- punpcklwd mm4,mm3 ; mm3=in6=z3
- punpckhwd mm5,mm3
- movq mm1,mm4
- movq mm3,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
- pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
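The rewrite in the comment above folds z1's shared multiply into per-operand constants, so tmp2 and tmp3 each become one dot product over an interleaved (z2, z3) pair, which is exactly what one pmaddwd computes per 32-bit lane. The algebra, checked in plain C (sketch):

#include <stdio.h>

int main(void) {
  double z2 = 3, z3 = -7;
  double z1 = (z2 + z3) * 0.541196100;                /* original form */
  double a = z1 + z3 * -1.847759065;                  /* tmp2 */
  double b = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
  printf("%f %f\n", a, b);                            /* identical */
  return 0;
}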
-
- movq mm6,mm0
- paddw mm0,mm2 ; mm0=in0+in4
- psubw mm6,mm2 ; mm6=in0-in4
-
- pxor mm7,mm7
- pxor mm2,mm2
- punpcklwd mm7,mm0 ; mm7=tmp0L
- punpckhwd mm2,mm0 ; mm2=tmp0H
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
- psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
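The pxor/punpcklwd/psrad idiom above widens words to dwords and pre-scales in one step: with the word in the top half of the lane, an arithmetic shift right by (16 - CONST_BITS) equals sign extension followed by scaling up by 2^CONST_BITS, as the inline comments note. Scalar check (sketch; assumes arithmetic >> on negative int, true of mainstream compilers):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const int CONST_BITS = 13;
  int16_t w = -123;
  int32_t lane = (int32_t)((uint32_t)(uint16_t)w << 16);  /* punpcklwd with 0 */
  printf("%d %d\n", lane >> (16 - CONST_BITS),            /* psrad */
         (int32_t)w * (1 << CONST_BITS));                 /* same value */
  return 0;
}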
-
- movq mm0,mm7
- paddd mm7,mm4 ; mm7=tmp10L
- psubd mm0,mm4 ; mm0=tmp13L
- movq mm4,mm2
- paddd mm2,mm5 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp13H
-
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
- movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
- movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
- movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
-
- pxor mm5,mm5
- pxor mm7,mm7
- punpcklwd mm5,mm6 ; mm5=tmp1L
- punpckhwd mm7,mm6 ; mm7=tmp1H
- psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
-
- movq mm2,mm5
- paddd mm5,mm1 ; mm5=tmp11L
- psubd mm2,mm1 ; mm2=tmp12L
- movq mm0,mm7
- paddd mm7,mm3 ; mm7=tmp11H
- psubd mm0,mm3 ; mm0=tmp12H
-
- movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
- movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
- movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
- movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movq mm5,mm6
- movq mm7,mm4
- paddw mm5,mm3 ; mm5=z3
- paddw mm7,mm1 ; mm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm2,mm5
- movq mm0,mm5
- punpcklwd mm2,mm7
- punpckhwd mm0,mm7
- movq mm5,mm2
- movq mm7,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
- pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
- pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
- movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movq mm2,mm3
- movq mm0,mm3
- punpcklwd mm2,mm4
- punpckhwd mm0,mm4
- movq mm3,mm2
- movq mm4,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
- pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
-
- paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
- paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
- paddd mm3,mm5 ; mm3=tmp3L
- paddd mm4,mm7 ; mm4=tmp3H
-
- movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
- movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
-
- movq mm2,mm1
- movq mm0,mm1
- punpcklwd mm2,mm6
- punpckhwd mm0,mm6
- movq mm1,mm2
- movq mm6,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
- pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
- pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
-
- paddd mm2,mm5 ; mm2=tmp1L
- paddd mm0,mm7 ; mm0=tmp1H
- paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
- paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
- movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
- movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
-
- movq mm2,mm5
- movq mm0,mm7
- paddd mm5,mm3 ; mm5=data0L
- paddd mm7,mm4 ; mm7=data0H
- psubd mm2,mm3 ; mm2=data7L
- psubd mm0,mm4 ; mm0=data7H
-
- movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
-
- paddd mm5,mm3
- paddd mm7,mm3
- psrad mm5,DESCALE_P1
- psrad mm7,DESCALE_P1
- paddd mm2,mm3
- paddd mm0,mm3
- psrad mm2,DESCALE_P1
- psrad mm0,DESCALE_P1
-
- packssdw mm5,mm7 ; mm5=data0=(00 01 02 03)
- packssdw mm2,mm0 ; mm2=data7=(70 71 72 73)
-
- movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
- movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
-
- movq mm7,mm4
- movq mm0,mm3
- paddd mm4,mm1 ; mm4=data1L
- paddd mm3,mm6 ; mm3=data1H
- psubd mm7,mm1 ; mm7=data6L
- psubd mm0,mm6 ; mm0=data6H
-
- movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
-
- paddd mm4,mm1
- paddd mm3,mm1
- psrad mm4,DESCALE_P1
- psrad mm3,DESCALE_P1
- paddd mm7,mm1
- paddd mm0,mm1
- psrad mm7,DESCALE_P1
- psrad mm0,DESCALE_P1
-
- packssdw mm4,mm3 ; mm4=data1=(10 11 12 13)
- packssdw mm7,mm0 ; mm7=data6=(60 61 62 63)
-
- movq mm6,mm5 ; transpose coefficients(phase 1)
- punpcklwd mm5,mm4 ; mm5=(00 10 01 11)
- punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
- movq mm1,mm7 ; transpose coefficients(phase 1)
- punpcklwd mm7,mm2 ; mm7=(60 70 61 71)
- punpckhwd mm1,mm2 ; mm1=(62 72 63 73)
-
- movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
- movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
- movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
- movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
- movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
- movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
- movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
-
- movq mm5,mm3
- movq mm6,mm0
- paddd mm3,mm4 ; mm3=data2L
- paddd mm0,mm2 ; mm0=data2H
- psubd mm5,mm4 ; mm5=data5L
- psubd mm6,mm2 ; mm6=data5H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
-
- paddd mm3,mm7
- paddd mm0,mm7
- psrad mm3,DESCALE_P1
- psrad mm0,DESCALE_P1
- paddd mm5,mm7
- paddd mm6,mm7
- psrad mm5,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm3,mm0 ; mm3=data2=(20 21 22 23)
- packssdw mm5,mm6 ; mm5=data5=(50 51 52 53)
-
- movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
- movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
- movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
- movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
-
- movq mm0,mm1
- movq mm6,mm4
- paddd mm1,mm2 ; mm1=data3L
- paddd mm4,mm7 ; mm4=data3H
- psubd mm0,mm2 ; mm0=data4L
- psubd mm6,mm7 ; mm6=data4H
-
- movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
-
- paddd mm1,mm2
- paddd mm4,mm2
- psrad mm1,DESCALE_P1
- psrad mm4,DESCALE_P1
- paddd mm0,mm2
- paddd mm6,mm2
- psrad mm0,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm1,mm4 ; mm1=data3=(30 31 32 33)
- packssdw mm0,mm6 ; mm0=data4=(40 41 42 43)
-
- movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
- movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
-
- movq mm4,mm3 ; transpose coefficients(phase 1)
- punpcklwd mm3,mm1 ; mm3=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm6,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm5 ; mm0=(40 50 41 51)
- punpckhwd mm6,mm5 ; mm6=(42 52 43 53)
-
- movq mm1,mm7 ; transpose coefficients(phase 2)
- punpckldq mm7,mm3 ; mm7=(00 10 20 30)
- punpckhdq mm1,mm3 ; mm1=(01 11 21 31)
- movq mm5,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm4 ; mm2=(02 12 22 32)
- punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
-
- movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
- movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
- movq mm7,mm0 ; transpose coefficients(phase 2)
- punpckldq mm0,mm3 ; mm0=(40 50 60 70)
- punpckhdq mm7,mm3 ; mm7=(41 51 61 71)
- movq mm1,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm4 ; mm6=(42 52 62 72)
- punpckhdq mm1,mm4 ; mm1=(43 53 63 73)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movq mm4,mm1 ; mm1=in2=z2
- movq mm5,mm1
- punpcklwd mm4,mm3 ; mm3=in6=z3
- punpckhwd mm5,mm3
- movq mm1,mm4
- movq mm3,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
- pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
-
- movq mm6,mm0
- paddw mm0,mm2 ; mm0=in0+in4
- psubw mm6,mm2 ; mm6=in0-in4
-
- pxor mm7,mm7
- pxor mm2,mm2
- punpcklwd mm7,mm0 ; mm7=tmp0L
- punpckhwd mm2,mm0 ; mm2=tmp0H
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
- psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
-
- movq mm0,mm7
- paddd mm7,mm4 ; mm7=tmp10L
- psubd mm0,mm4 ; mm0=tmp13L
- movq mm4,mm2
- paddd mm2,mm5 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp13H
-
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
- movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
- movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
- movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
-
- pxor mm5,mm5
- pxor mm7,mm7
- punpcklwd mm5,mm6 ; mm5=tmp1L
- punpckhwd mm7,mm6 ; mm7=tmp1H
- psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
-
- movq mm2,mm5
- paddd mm5,mm1 ; mm5=tmp11L
- psubd mm2,mm1 ; mm2=tmp12L
- movq mm0,mm7
- paddd mm7,mm3 ; mm7=tmp11H
- psubd mm0,mm3 ; mm0=tmp12H
-
- movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
- movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
- movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
- movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm5,mm6
- movq mm7,mm4
- paddw mm5,mm3 ; mm5=z3
- paddw mm7,mm1 ; mm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
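A subtlety in the comment above: "z3" on the right-hand side of the second line still denotes the original z3, not the updated value; the two outputs are computed independently, which is what the PW_MF078_F117 and PW_F117_F078 constant pairs encode. Spelled out in C (FIX() as in the sketch above):

    /* z3/z4 pair in independent dot-product form; both lines read the
       original inputs, matching the two pmaddwd constant pairs. */
    static void odd_z_pair(int32_t z3, int32_t z4,
                           int32_t *z3_out, int32_t *z4_out)
    {
      *z3_out = z3 * (FIX(1.175875602) - FIX(1.961570560)) +
                z4 * FIX(1.175875602);
      *z4_out = z3 * FIX(1.175875602) +
                z4 * (FIX(1.175875602) - FIX(0.390180644));
    }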
-
- movq mm2,mm5
- movq mm0,mm5
- punpcklwd mm2,mm7
- punpckhwd mm0,mm7
- movq mm5,mm2
- movq mm7,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
- pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
- pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
- movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
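The same distribution applies to the four odd-part terms: substituting z1 = tmp0 + tmp3 and z2 = tmp1 + tmp2 into the original expressions leaves four independent two-term dot products plus the shared z3/z4 corrections. In C (FIX() as above):

    static void odd_tmps(int32_t t0, int32_t t1, int32_t t2, int32_t t3,
                         int32_t z3, int32_t z4,
                         int32_t *o0, int32_t *o1, int32_t *o2, int32_t *o3)
    {
      *o0 =  t0 * (FIX(0.298631336) - FIX(0.899976223)) - t3 * FIX(0.899976223) + z3;
      *o1 =  t1 * (FIX(2.053119869) - FIX(2.562915447)) - t2 * FIX(2.562915447) + z4;
      *o2 = -t1 * FIX(2.562915447) + t2 * (FIX(3.072711026) - FIX(2.562915447)) + z3;
      *o3 = -t0 * FIX(0.899976223) + t3 * (FIX(1.501321110) - FIX(0.899976223)) + z4;
    }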
-
- movq mm2,mm3
- movq mm0,mm3
- punpcklwd mm2,mm4
- punpckhwd mm0,mm4
- movq mm3,mm2
- movq mm4,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
- pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
-
- paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
- paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
- paddd mm3,mm5 ; mm3=tmp3L
- paddd mm4,mm7 ; mm4=tmp3H
-
- movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
- movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
-
- movq mm2,mm1
- movq mm0,mm1
- punpcklwd mm2,mm6
- punpckhwd mm0,mm6
- movq mm1,mm2
- movq mm6,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
- pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
- pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
-
- paddd mm2,mm5 ; mm2=tmp1L
- paddd mm0,mm7 ; mm0=tmp1H
- paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
- paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
- movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
- movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
-
- movq mm2,mm5
- movq mm0,mm7
- paddd mm5,mm3 ; mm5=data0L
- paddd mm7,mm4 ; mm7=data0H
- psubd mm2,mm3 ; mm2=data7L
- psubd mm0,mm4 ; mm0=data7H
-
- movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
-
- paddd mm5,mm3
- paddd mm7,mm3
- psrad mm5,DESCALE_P2
- psrad mm7,DESCALE_P2
- paddd mm2,mm3
- paddd mm0,mm3
- psrad mm2,DESCALE_P2
- psrad mm0,DESCALE_P2
-
- packssdw mm5,mm7 ; mm5=data0=(00 10 20 30)
- packssdw mm2,mm0 ; mm2=data7=(07 17 27 37)
-
- movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
- movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
-
- movq mm7,mm4
- movq mm0,mm3
- paddd mm4,mm1 ; mm4=data1L
- paddd mm3,mm6 ; mm3=data1H
- psubd mm7,mm1 ; mm7=data6L
- psubd mm0,mm6 ; mm0=data6H
-
- movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
-
- paddd mm4,mm1
- paddd mm3,mm1
- psrad mm4,DESCALE_P2
- psrad mm3,DESCALE_P2
- paddd mm7,mm1
- paddd mm0,mm1
- psrad mm7,DESCALE_P2
- psrad mm0,DESCALE_P2
-
- packssdw mm4,mm3 ; mm4=data1=(01 11 21 31)
- packssdw mm7,mm0 ; mm7=data6=(06 16 26 36)
-
- packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36)
- packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37)
-
- movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
- movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
- movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
- movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
- movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
-
- movq mm7,mm6
- movq mm2,mm1
- paddd mm6,mm3 ; mm6=data2L
- paddd mm1,mm0 ; mm1=data2H
- psubd mm7,mm3 ; mm7=data5L
- psubd mm2,mm0 ; mm2=data5H
-
- movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
-
- paddd mm6,mm5
- paddd mm1,mm5
- psrad mm6,DESCALE_P2
- psrad mm1,DESCALE_P2
- paddd mm7,mm5
- paddd mm2,mm5
- psrad mm7,DESCALE_P2
- psrad mm2,DESCALE_P2
-
- packssdw mm6,mm1 ; mm6=data2=(02 12 22 32)
- packssdw mm7,mm2 ; mm7=data5=(05 15 25 35)
-
- movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
- movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
- movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
- movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
-
- movq mm1,mm4
- movq mm2,mm3
- paddd mm4,mm0 ; mm4=data3L
- paddd mm3,mm5 ; mm3=data3H
- psubd mm1,mm0 ; mm1=data4L
- psubd mm2,mm5 ; mm2=data4H
-
- movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
-
- paddd mm4,mm0
- paddd mm3,mm0
- psrad mm4,DESCALE_P2
- psrad mm3,DESCALE_P2
- paddd mm1,mm0
- paddd mm2,mm0
- psrad mm1,DESCALE_P2
- psrad mm2,DESCALE_P2
-
- movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
-
- packssdw mm4,mm3 ; mm4=data3=(03 13 23 33)
- packssdw mm1,mm2 ; mm1=data4=(04 14 24 34)
-
- movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
- movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
-
- packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34)
- packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35)
-
- paddb mm0,mm5
- paddb mm3,mm5
- paddb mm6,mm5
- paddb mm4,mm5
-
- movq mm2,mm0 ; transpose coefficients(phase 1)
- punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31)
- punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37)
- movq mm1,mm6 ; transpose coefficients(phase 1)
- punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33)
- punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35)
-
- movq mm7,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13)
- punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33)
- movq mm5,mm1 ; transpose coefficients(phase 2)
- punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17)
- punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37)
-
- movq mm3,mm0 ; transpose coefficients(phase 3)
- punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07)
- punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17)
- movq mm4,mm7 ; transpose coefficients(phase 3)
- punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27)
- punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_JCOEF ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-sse2-64.asm b/media/libjpeg/simd/jidctint-sse2-64.asm
deleted file mode 100644
index afe1d6a73b..0000000000
--- a/media/libjpeg/simd/jidctint-sse2-64.asm
+++ /dev/null
@@ -1,847 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
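The CONST_BITS == 13 table above is FIX(x) = round(x * 2^13) precomputed; a small self-contained check (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define CONST_BITS 13
    #define FIX(x) ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

    int main(void)
    {
      assert(FIX(0.298631336) ==  2446);   /* F_0_298 */
      assert(FIX(0.541196100) ==  4433);   /* F_0_541 */
      assert(FIX(1.847759065) == 15137);   /* F_1_847 */
      assert(FIX(3.072711026) == 25172);   /* F_3_072 */
      return 0;
    }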
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
-
- align 16
- global EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
-%endif
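The guarded block above is a fast path for sparse blocks: rows 1-7 are OR-ed together and collapsed by two packsswb steps so a single test catches any nonzero AC coefficient, and if none is found, each pass-1 output column is a constant, so the column IDCT reduces to dequantize, shift, and broadcast. Conceptually, in C (a sketch; the asm keeps the broadcast columns in registers rather than in a workspace):

    #include <stdint.h>

    #define PASS1_BITS 2

    /* DC-only shortcut: with all AC terms zero, column c of the pass-1
       result is (coef[c] * quant[c]) << PASS1_BITS replicated 8 times,
       which is what the pmullw/psllw/pshufd sequence produces. */
    static void dc_only_pass1(const int16_t coef_row0[8],
                              const int16_t quant_row0[8],
                              int16_t workspace[8][8])
    {
      for (int c = 0; c < 8; c++) {
        int16_t v = (int16_t)((coef_row0[c] * quant_row0[c]) << PASS1_BITS);
        for (int r = 0; r < 8; r++)
          workspace[r][c] = v;
      }
    }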
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
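The pxor/punpcklwd/psrad idiom above widens and scales in one step: interleaving a zero register below the 16-bit values leaves each word in the high half of a 32-bit lane, and an arithmetic shift by 16 - CONST_BITS yields the sign-extended value times 2^CONST_BITS, as the inline comments note. Per lane, in C (assuming arithmetic right shift of signed values, as psrad performs):

    #include <stdint.h>

    #define CONST_BITS 13

    /* Sign-extend a 16-bit value to 32 bits and scale by 2^CONST_BITS. */
    static int32_t widen_and_scale(int16_t v)
    {
      int32_t lane = (int32_t)((uint32_t)(uint16_t)v << 16); /* punpck*wd with 0 */
      return lane >> (16 - CONST_BITS);                      /* psrad */
    }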
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L
- pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H
- pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
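These four prefetchnta lines cover the 128 bytes (DCTSIZE2 JCOEFs) of the next coefficient block while pass 2 is still running. An intrinsic equivalent, as an illustration (not code from this patch):

    #include <xmmintrin.h>

    /* Hint the next 64-coefficient block into the non-temporal cache level. */
    static void prefetch_next_block(const short *coef_block)
    {
      const char *next = (const char *)(coef_block + 64);  /* DCTSIZE2 ahead */
      _mm_prefetch(next + 0 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 1 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 2 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 3 * 32, _MM_HINT_NTA);
    }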
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L
- pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H
- pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
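The packssdw/packsswb/paddb sequence stands in for the range_limit table of jidctint.c: the saturating packs clamp each descaled result to the signed 8-bit range, and adding CENTERJSAMPLE lands it in 0..255. Per sample, in C:

    #include <stdint.h>

    #define CENTERJSAMPLE 128

    static uint8_t range_limit(int32_t v)   /* v = descaled IDCT output */
    {
      if (v < -128) v = -128;               /* packssdw/packsswb saturation */
      if (v >  127) v =  127;
      return (uint8_t)(v + CENTERJSAMPLE);  /* paddb PB_CENTERJSAMP */
    }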
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
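The three punpck phases above form the classic interleave transpose: each phase merges pairs of rows at twice the previous granularity (bytes, then words, then dwords), so log2(8) = 3 phases transpose the 8x8 tile, and the final pshufd 0x4E swaps qword halves to expose the odd-numbered rows. A scalar model of one byte-interleave step (a simplification):

    #include <stdint.h>

    /* One punpcklbw/punpckhbw pair on two 16-byte rows: interleave the low
       halves into lo[] and the high halves into hi[]. */
    static void interleave_bytes(const uint8_t a[16], const uint8_t b[16],
                                 uint8_t lo[16], uint8_t hi[16])
    {
      for (int i = 0; i < 8; i++) {
        lo[2 * i]     = a[i];      lo[2 * i + 1] = b[i];
        hi[2 * i]     = a[i + 8];  hi[2 * i + 1] = b[i + 8];
      }
    }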
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-sse2.asm b/media/libjpeg/simd/jidctint-sse2.asm
deleted file mode 100644
index 6c7e7d9b4f..0000000000
--- a/media/libjpeg/simd/jidctint-sse2.asm
+++ /dev/null
@@ -1,858 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
-
- align 16
- global EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
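
Each pmaddwd below multiplies two adjacent signed 16-bit words by a packed
constant pair and sums the products into one signed 32-bit lane, which is why
the constants are interleaved as (c0, c1) pairs.  A per-lane C model
(illustrative only):

    #include <stdint.h>

    /* One pmaddwd lane: a*c0 + b*c1 with 16-bit inputs, 32-bit result.
       With (a, b) = (z3, z4) and (c0, c1) = (F_1_175 - F_1_961, F_1_175)
       this computes the new z3 in a single step. */
    static int32_t pmaddwd_lane(int16_t a, int16_t b, int16_t c0, int16_t c1)
    {
      return (int32_t)a * c0 + (int32_t)b * c1;
    }
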
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
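
A scalar sketch of the regrouped odd part, transcribed from the comment above
(hedged: variable names follow the comment rather than any real source file;
z3 and z4 are the rotated values computed in the previous step):

    /* Each output needs one multiply-accumulate pair plus one add,
       matching the pmaddwd/paddd sequence that follows.  Inputs are the
       coefficients from rows/columns 1, 3, 5 and 7. */
    static void odd_part(double in1, double in3, double in5, double in7,
                         double z3, double z4, double tmp[4])
    {
      tmp[0] = in7 * (0.298631336 - 0.899976223) + in1 * -0.899976223 + z3;
      tmp[1] = in5 * (2.053119869 - 2.562915447) + in3 * -2.562915447 + z4;
      tmp[2] = in5 * -2.562915447 + in3 * (3.072711026 - 2.562915447) + z3;
      tmp[3] = in7 * -0.899976223 + in1 * (1.501321110 - 0.899976223) + z4;
    }
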
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-mmx.asm b/media/libjpeg/simd/jidctred-mmx.asm
deleted file mode 100644
index ba054e31a0..0000000000
--- a/media/libjpeg/simd/jidctred-mmx.asm
+++ /dev/null
@@ -1,705 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
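
These reduced-size routines are reached when the decompressor is asked for a
scaled-down image; a minimal usage sketch in C (error handling and the data
source are omitted, and the helper name is illustrative):

    #include <jpeglib.h>

    /* Requesting 1/2 scale makes libjpeg emit 4x4 pixels per 8x8 DCT
       block, selecting the 4x4 routines in this file; 1/4 scale selects
       the 2x2 routines. */
    static void request_reduced_size(struct jpeg_decompress_struct *cinfo)
    {
      cinfo->scale_num = 1;
      cinfo->scale_denom = 4;            /* 8x8 blocks -> 2x2 pixels */
      jpeg_start_decompress(cinfo);
    }
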
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
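
The FIX() values above are 13-bit-fraction fixed-point constants; a C
restatement of the two conventions, mirroring the definitions above (a sketch):

    #include <stdint.h>

    #define CONST_BITS 13

    /* FIX(x) scales x by 2^13 with rounding, e.g. FIX(1.847759065) == 15137,
       matching F_1_847 above. */
    #define FIX(x)  ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

    /* DESCALE(x,n) divides by 2^n with rounding, as in the %else branch. */
    #define DESCALE(x, n)  (((x) + ((int32_t)1 << ((n) - 1))) >> (n))
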
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_mmx)
-
-EXTN(jconst_idct_red_mmx):
-
-PW_F184_MF076 times 2 dw F_1_847,-F_0_765
-PW_F256_F089 times 2 dw F_2_562, F_0_899
-PW_F106_MF217 times 2 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 2 dw F_1_451,-F_0_211
-PW_F362_MF127 times 2 dw F_3_624,-F_1_272
-PW_F085_MF072 times 2 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
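
The prologue below keeps two frames: an MMWORD-aligned one holding wk[] and the
workspace, plus the caller's original ebp saved just underneath.  The alignment
step is the usual round-down-by-masking trick; a C model (illustrative only):

    #include <stdint.h>

    /* Round a pointer down to a power-of-two boundary, as the
       `and esp, -SIZEOF_MMWORD` below does for the stack pointer. */
    static void *align_down(void *p, uintptr_t align)
    {
      return (void *)((uintptr_t)p & ~(align - 1));
    }
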
-
- align 16
- global EXTN(jsimd_idct_4x4_mmx)
-
-EXTN(jsimd_idct_4x4_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm0,mm1
- packsswb mm0,mm0
- movd eax,mm0
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw mm0,PASS1_BITS
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
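
The block above is a fast path: when every AC coefficient in the column group
is zero, the 1-D IDCT collapses to a constant, so only the DC term has to be
dequantized and replicated.  A scalar sketch (names are illustrative):

    /* If col[1..7] are all zero, each output of the column IDCT equals
       the dequantized DC value, pre-scaled by PASS1_BITS for pass 2. */
    static void idct_col_dc_only(const short *col, const short *quant,
                                 short *out, int nrows, int pass1_bits)
    {
      short dc = (short)((col[0] * quant[0]) << pass1_bits);
      for (int i = 0; i < nrows; i++)
        out[i] = dc;
    }
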
-.columnDCT:
-
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm0
- punpcklwd mm4,mm1
- punpckhwd mm5,mm1
- movq mm0,mm4
- movq mm1,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
- pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
- pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
- pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
-
- movq mm6,mm2
- movq mm7,mm2
- punpcklwd mm6,mm3
- punpckhwd mm7,mm3
- movq mm2,mm6
- movq mm3,mm7
- pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
- pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
- pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
- pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
-
- paddd mm6,mm4 ; mm6=tmp2L
- paddd mm7,mm5 ; mm7=tmp2H
- paddd mm2,mm0 ; mm2=tmp0L
- paddd mm3,mm1 ; mm3=tmp0H
-
- movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
- movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor mm1,mm1
- pxor mm2,mm2
- punpcklwd mm1,mm4 ; mm1=tmp0L
- punpckhwd mm2,mm4 ; mm2=tmp0H
- psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
- psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
- movq mm3,mm5 ; mm5=in2=z2
- punpcklwd mm5,mm0 ; mm0=in6=z3
- punpckhwd mm3,mm0
- pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
-
- movq mm4,mm1
- movq mm0,mm2
- paddd mm1,mm5 ; mm1=tmp10L
- paddd mm2,mm3 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp12L
- psubd mm0,mm3 ; mm0=tmp12H
-
- ; -- Final output stage
-
- movq mm5,mm1
- movq mm3,mm2
- paddd mm1,mm6 ; mm1=data0L
- paddd mm2,mm7 ; mm2=data0H
- psubd mm5,mm6 ; mm5=data3L
- psubd mm3,mm7 ; mm3=data3H
-
- movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
-
- paddd mm1,mm6
- paddd mm2,mm6
- psrad mm1,DESCALE_P1_4
- psrad mm2,DESCALE_P1_4
- paddd mm5,mm6
- paddd mm3,mm6
- psrad mm5,DESCALE_P1_4
- psrad mm3,DESCALE_P1_4
-
- packssdw mm1,mm2 ; mm1=data0=(00 01 02 03)
- packssdw mm5,mm3 ; mm5=data3=(30 31 32 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
- movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
-
- movq mm2,mm4
- movq mm3,mm0
- paddd mm4,mm7 ; mm4=data1L
- paddd mm0,mm6 ; mm0=data1H
- psubd mm2,mm7 ; mm2=data2L
- psubd mm3,mm6 ; mm3=data2H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
-
- paddd mm4,mm7
- paddd mm0,mm7
- psrad mm4,DESCALE_P1_4
- psrad mm0,DESCALE_P1_4
- paddd mm2,mm7
- paddd mm3,mm7
- psrad mm2,DESCALE_P1_4
- psrad mm3,DESCALE_P1_4
-
- packssdw mm4,mm0 ; mm4=data1=(10 11 12 13)
- packssdw mm2,mm3 ; mm2=data2=(20 21 22 23)
-
- movq mm6,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm4 ; mm1=(00 10 01 11)
- punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
- movq mm7,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm5 ; mm2=(20 30 21 31)
- punpckhwd mm7,mm5 ; mm7=(22 32 23 33)
-
- movq mm0,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm2 ; mm1=(00 10 20 30)
- punpckhdq mm0,mm2 ; mm0=(01 11 21 31)
- movq mm3,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm7 ; mm6=(02 12 22 32)
- punpckhdq mm3,mm7 ; mm3=(03 13 23 33)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm0
- movq mm5,mm0
- punpcklwd mm4,mm1
- punpckhwd mm5,mm1
- movq mm0,mm4
- movq mm1,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
- pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
- pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
- pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
-
- movq mm6,mm2
- movq mm7,mm2
- punpcklwd mm6,mm3
- punpckhwd mm7,mm3
- movq mm2,mm6
- movq mm3,mm7
- pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
- pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
- pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
- pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
-
- paddd mm6,mm4 ; mm6=tmp2L
- paddd mm7,mm5 ; mm7=tmp2H
- paddd mm2,mm0 ; mm2=tmp0L
- paddd mm3,mm1 ; mm3=tmp0H
-
- movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
- movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- pxor mm1,mm1
- pxor mm2,mm2
- punpcklwd mm1,mm4 ; mm1=tmp0L
- punpckhwd mm2,mm4 ; mm2=tmp0H
- psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
- psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
- movq mm3,mm5 ; mm5=in2=z2
- punpcklwd mm5,mm0 ; mm0=in6=z3
- punpckhwd mm3,mm0
- pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
-
- movq mm4,mm1
- movq mm0,mm2
- paddd mm1,mm5 ; mm1=tmp10L
- paddd mm2,mm3 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp12L
- psubd mm0,mm3 ; mm0=tmp12H
-
- ; -- Final output stage
-
- movq mm5,mm1
- movq mm3,mm2
- paddd mm1,mm6 ; mm1=data0L
- paddd mm2,mm7 ; mm2=data0H
- psubd mm5,mm6 ; mm5=data3L
- psubd mm3,mm7 ; mm3=data3H
-
- movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
-
- paddd mm1,mm6
- paddd mm2,mm6
- psrad mm1,DESCALE_P2_4
- psrad mm2,DESCALE_P2_4
- paddd mm5,mm6
- paddd mm3,mm6
- psrad mm5,DESCALE_P2_4
- psrad mm3,DESCALE_P2_4
-
- packssdw mm1,mm2 ; mm1=data0=(00 10 20 30)
- packssdw mm5,mm3 ; mm5=data3=(03 13 23 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
- movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
-
- movq mm2,mm4
- movq mm3,mm0
- paddd mm4,mm7 ; mm4=data1L
- paddd mm0,mm6 ; mm0=data1H
- psubd mm2,mm7 ; mm2=data2L
- psubd mm3,mm6 ; mm3=data2H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
-
- paddd mm4,mm7
- paddd mm0,mm7
- psrad mm4,DESCALE_P2_4
- psrad mm0,DESCALE_P2_4
- paddd mm2,mm7
- paddd mm3,mm7
- psrad mm2,DESCALE_P2_4
- psrad mm3,DESCALE_P2_4
-
- packssdw mm4,mm0 ; mm4=data1=(01 11 21 31)
- packssdw mm2,mm3 ; mm2=data2=(02 12 22 32)
-
- movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
-
- packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32)
- packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33)
- paddb mm1,mm6
- paddb mm4,mm6
-
- movq mm7,mm1 ; transpose coefficients(phase 1)
- punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31)
- punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33)
-
- movq mm0,mm1 ; transpose coefficients(phase 2)
- punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13)
- punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
- psrlq mm1,4*BYTE_BIT
- psrlq mm0,4*BYTE_BIT
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_mmx)
-
-EXTN(jsimd_idct_2x2_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
- mov edx, POINTER [dct_table(ebp)] ; quantptr
- mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
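
As the diagram shows, only rows 0, 1, 3, 5 and 7 (and the matching columns)
contribute to the 2x2 result.  A scalar sketch of one column using the FIX()
constants defined above (dequantization omitted; names are illustrative):

    #include <stdint.h>

    /* One 2x2 column: the odd rows form a single weighted sum, and the
       two outputs are DC +/- that sum (before descaling). */
    static void idct_2x2_col(const int16_t in[8], int32_t *A, int32_t *B)
    {
      int32_t tmp0 = in[1] *  29692     /*  FIX(3.624509785) */
                   + in[3] * -10426     /* -FIX(1.272758580) */
                   + in[5] *   6967     /*  FIX(0.850430095) */
                   + in[7] *  -5906;    /* -FIX(0.720959822) */
      int32_t tmp10 = (int32_t)in[0] << (13 + 2);   /* CONST_BITS+2 */
      *A = tmp10 + tmp0;
      *B = tmp10 - tmp0;
    }
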
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
- ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
- pcmpeqd mm7,mm7
- pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
- movq mm4,mm0 ; mm4=(10 11 ** 13)
- movq mm5,mm2 ; mm5=(50 51 ** 53)
- punpcklwd mm4,mm1 ; mm4=(10 30 11 31)
- punpcklwd mm5,mm3 ; mm5=(50 70 51 71)
- pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- psrld mm0,WORD_BIT ; mm0=(11 -- 13 --)
- pand mm1,mm7 ; mm1=(-- 31 -- 33)
- psrld mm2,WORD_BIT ; mm2=(51 -- 53 --)
- pand mm3,mm7 ; mm3=(-- 71 -- 73)
- por mm0,mm1 ; mm0=(11 31 13 33)
- por mm2,mm3 ; mm2=(51 71 53 73)
- pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm4,mm5 ; mm4=tmp0[col0 col1]
-
- movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
- pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
- pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
- ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
- psrld mm6,WORD_BIT ; mm6=(15 -- 17 --)
- pand mm1,mm7 ; mm1=(-- 35 -- 37)
- psrld mm3,WORD_BIT ; mm3=(55 -- 57 --)
- pand mm5,mm7 ; mm5=(-- 75 -- 77)
- por mm6,mm1 ; mm6=(15 35 17 37)
- por mm3,mm5 ; mm3=(55 75 57 77)
- pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm0,mm2 ; mm0=tmp0[col1 col3]
- paddd mm6,mm3 ; mm6=tmp0[col5 col7]
-
- ; -- Even part
-
- movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
- pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
- movq mm2,mm1 ; mm2=(00 01 ** 03)
- pslld mm1,WORD_BIT ; mm1=(-- 00 -- **)
- psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
-
- pand mm2,mm7 ; mm2=(-- 01 -- 03)
- pand mm5,mm7 ; mm5=(-- 05 -- 07)
- psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
- psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
-
- ; -- Final output stage
-
- movq mm3,mm1
- paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **)
- psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **)
- punpckldq mm1,mm3 ; mm1=(A0 B0)
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
-
- movq mm4,mm2
- movq mm3,mm5
- paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3)
- paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7)
- psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3)
- psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7)
-
- paddd mm1,mm7
- psrad mm1,DESCALE_P1_2
-
- paddd mm2,mm7
- paddd mm5,mm7
- psrad mm2,DESCALE_P1_2
- psrad mm5,DESCALE_P1_2
- paddd mm4,mm7
- paddd mm3,mm7
- psrad mm4,DESCALE_P1_2
- psrad mm3,DESCALE_P1_2
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(ebp)]
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3)
- packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7)
- pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm2,mm5 ; mm2=tmp0[row0 row1]
-
- ; -- Even part
-
- pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1]
-
- ; -- Final output stage
-
- movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
-
- movq mm6,mm1
- paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1)
- psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1)
-
- paddd mm1,mm0
- paddd mm6,mm0
- psrad mm1,DESCALE_P2_2
- psrad mm6,DESCALE_P2_2
-
- movq mm7,mm1 ; transpose coefficients
- punpckldq mm1,mm6 ; mm1=(C0 D0)
- punpckhdq mm7,mm6 ; mm7=(C1 D1)
-
- packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1)
- packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
- paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- movd ecx,mm1
- movd ebx,mm1 ; ebx=(C0 D0 C1 D1)
- shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-sse2-64.asm b/media/libjpeg/simd/jidctred-sse2-64.asm
deleted file mode 100644
index a54bbe24e0..0000000000
--- a/media/libjpeg/simd/jidctred-sse2-64.asm
+++ /dev/null
@@ -1,575 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
-%endif
-.columnDCT:
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
- pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
- pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
- pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
- pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
- pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
- pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
- ; -- Odd part
-
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
- pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
-
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
-
- ; -- Even part
-
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
-
- ; -- Final output stage
-
- movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
-
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
-
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
-
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
-
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
-
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
-
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[rel PB_CENTERJSAMP]
-
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
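
The three pshufd immediates rotate the packed result by one, two and three
dwords so each row can be stored with a plain movd.  A C model of the shuffle
(illustrative):

    #include <stdint.h>

    /* pshufd: destination dword i = source dword ((imm >> 2*i) & 3).
       imm = 0x39 selects (1,2,3,0), 0x4E selects (2,3,0,1) and 0x93
       selects (3,0,1,2) -- rotations by one, two and three dwords. */
    static void pshufd_model(uint32_t dst[4], const uint32_t src[4], int imm)
    {
      for (int i = 0; i < 4; i++)
        dst[i] = src[(imm >> (2 * i)) & 3];
    }
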
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
- mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[rel PW_F362_MF127]
- pmaddwd xmm5,[rel PW_F085_MF072]
-
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[rel PW_F362_MF127]
- pmaddwd xmm2,[rel PW_F085_MF072]
-
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
-
- ; -- Even part
-
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
- ; -- Final output stage
-
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
- movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
-
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
-
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
-
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
-
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[rel PW_F362_MF127]
- pmaddwd xmm7,[rel PW_F085_MF072]
-
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
-
- ; -- Even part
-
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
-
- ; -- Final output stage
-
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
-
- paddd xmm6,[rel PD_DESCALE_P2_2]
- psrad xmm6,DESCALE_P2_2
-
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[rel PB_CENTERJSAMP]
-
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
- mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-sse2.asm b/media/libjpeg/simd/jidctred-sse2.asm
deleted file mode 100644
index 232d9838d6..0000000000
--- a/media/libjpeg/simd/jidctred-sse2.asm
+++ /dev/null
@@ -1,593 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
- pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
- ; -- Odd part
-
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
-
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
-
- ; -- Even part
-
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
-
- ; -- Final output stage
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
-
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
-
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
-
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
-
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
-
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
-
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-
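A note on the shortcut at the top of the pass-1 code above: the NO_ZERO_COLUMN_TEST path ORs together rows 1, 2, 3, 5, 6 and 7 of the coefficient block; row 4 is skipped because the 4-point kernel never reads it. When every tested row is zero, each column's 1-D transform is a constant, and pass 1 collapses to scaling the dequantized DC row (the psllw/punpcklwd/pshufd sequence above). A minimal C sketch of the same test, assuming a row-major int16_t[64] coefficient block (the helper name is illustrative, not part of this patch):

#include <stdint.h>

/* Nonzero when all AC rows that the 4x4 kernel reads are zero, so that
 * pass 1 can reduce to scaling the DC row.  Row 4 is neither read nor
 * tested, matching the assembly above. */
static int ac_rows_all_zero_4x4(const int16_t coef[64])
{
  static const int rows[6] = { 1, 2, 3, 5, 6, 7 };
  int i, c;

  for (i = 0; i < 6; i++)
    for (c = 0; c < 8; c++)
      if (coef[8 * rows[i] + c] != 0)
        return 0;
  return 1;
}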
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
- mov edx, POINTER [dct_table(ebp)] ; quantptr
- mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
-
- ; -- Even part
-
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
- ; -- Final output stage
-
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
-
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
-
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
-
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
-
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(ebp)]
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
-
- ; -- Even part
-
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
-
- ; -- Final output stage
-
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
-
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
- psrad xmm6,DESCALE_P2_2
-
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
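For reference, the arithmetic that jsimd_idct_2x2_sse2 implements in 13-bit fixed point can be written as a small floating-point model. fold(u) below reproduces the constant pairs used above (PW_F362_MF127 = 3.6245/-1.2727 and PW_F085_MF072 = 0.8504/-0.7210, before scaling by 2^13); it evaluates to zero for the even frequencies 2, 4 and 6, which is why the assembly never loads those rows or columns. The final division by 128 corresponds to the combined DESCALE_P1_2/DESCALE_P2_2 shifts, and the +128 to the paddb with PB_CENTERJSAMP. A sketch under those assumptions (names are illustrative; this is a model of the dataflow, not the patched code):

#include <math.h>
#include <stdint.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Folds the 8-point cosine basis at frequency u onto a 2-sample grid;
 * gives 3.6245, -1.2727, 0.8504, -0.7210 for u = 1, 3, 5, 7 and zero
 * for u = 2, 4, 6. */
static double fold(int u)
{
  double s = 0.0;
  int x;

  for (x = 0; x < 4; x++)
    s += cos((2 * x + 1) * u * M_PI / 16.0);
  return sqrt(2.0) * s;
}

/* coef: 8x8 quantized coefficients, row-major; quant: dequantization
 * multipliers; out: 2x2 samples, centered on 128 like JSAMPLEs. */
static void idct_2x2_model(const int16_t coef[64], const double quant[64],
                           uint8_t out[2][2])
{
  static const int odd[4] = { 1, 3, 5, 7 };
  double w[2][8] = { { 0.0 } };
  int c, i, r, s;

  /* Pass 1: columns 0, 1, 3, 5, 7 -- the only ones pass 2 reads. */
  for (c = 0; c < 8; c += (c == 0) ? 1 : 2) {
    double dc = coef[c] * quant[c], t = 0.0;
    for (i = 0; i < 4; i++)
      t += fold(odd[i]) * coef[8 * odd[i] + c] * quant[8 * odd[i] + c];
    w[0][c] = 4.0 * dc + t;   /* row A in the diagram above */
    w[1][c] = 4.0 * dc - t;   /* row B */
  }
  /* Pass 2: rows; 1/128 matches the total descaling of the two passes. */
  for (r = 0; r < 2; r++) {
    double t = 0.0;
    for (i = 0; i < 4; i++)
      t += fold(odd[i]) * w[r][odd[i]];
    for (s = 0; s < 2; s++) {
      double v = (4.0 * w[r][0] + (s ? -t : t)) / 128.0 + 128.0;
      out[r][s] = (uint8_t)(v < 0.0 ? 0.0 : v > 255.0 ? 255.0 : v + 0.5);
    }
  }
}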
diff --git a/media/libjpeg/simd/jpeg_nbits_table.inc b/media/libjpeg/simd/jpeg_nbits_table.inc
deleted file mode 100644
index cbc69904e1..0000000000
--- a/media/libjpeg/simd/jpeg_nbits_table.inc
+++ /dev/null
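The file deleted below is a 64 KB lookup table: jpeg_nbits_table[v] is the bit length of v (0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3, and so on), i.e. the magnitude-category size the Huffman encoders need for each coefficient value. A minimal scalar equivalent, for reference only (this patch just deletes the table; the function name is illustrative):

#include <stdint.h>

/* Number of bits needed to represent v -- what each table entry encodes. */
static int jpeg_nbits(uint16_t v)
{
  int n = 0;

  while (v) {
    n++;
    v >>= 1;
  }
  return n;
}

With GCC or Clang the same value can be computed branch-free for nonzero v as 32 - __builtin_clz(v).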
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db \
- 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
[... 406 further identical removed lines of "14" entries (16 per line) elided ...]
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
  [... 593 further identical removed lines ("15" × 16, line-continued) elided ...]
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
  [... 346 further identical removed lines ("16" × 16, line-continued) elided ...]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
[... 942 further identical deleted lines omitted: each repeats "16" sixteen times, continuing the all-16 run of this deleted table ...]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/media/libjpeg/simd/jquant-3dn.asm b/media/libjpeg/simd/jquant-3dn.asm
deleted file mode 100644
index 0b4164b261..0000000000
--- a/media/libjpeg/simd/jquant-3dn.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_3dnow)
-
-EXTN(jsimd_convsamp_float_3dnow):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw mm7,mm7
- psllw mm7,7
- packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb mm0,mm7 ; mm0=(01234567)
- psubb mm1,mm7 ; mm1=(89ABCDEF)
-
- punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
- punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
- punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
- punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
-
- punpcklwd mm4,mm2 ; mm4=(***0***1)
- punpckhwd mm2,mm2 ; mm2=(***2***3)
- punpcklwd mm5,mm0 ; mm5=(***4***5)
- punpckhwd mm0,mm0 ; mm0=(***6***7)
-
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
- psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
- pi2fd mm4,mm4
- pi2fd mm2,mm2
- psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
- psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
- pi2fd mm5,mm5
- pi2fd mm0,mm0
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
- punpcklwd mm6,mm3 ; mm6=(***8***9)
- punpckhwd mm3,mm3 ; mm3=(***A***B)
- punpcklwd mm4,mm1 ; mm4=(***C***D)
- punpckhwd mm1,mm1 ; mm1=(***E***F)
-
- psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
- psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
- pi2fd mm6,mm6
- pi2fd mm3,mm3
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
- psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
- pi2fd mm4,mm4
- pi2fd mm1,mm1
-
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .convloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_3dnow)
-
-EXTN(jsimd_quantize_float_3dnow):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
- movd mm7,eax
- punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F}
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
- pfadd mm0,mm7 ; mm0=(00 ** 01 **)
- pfadd mm1,mm7 ; mm1=(02 ** 03 **)
- pfadd mm2,mm7 ; mm2=(04 ** 05 **)
- pfadd mm3,mm7 ; mm3=(06 ** 07 **)
-
- movq mm4,mm0
- punpcklwd mm0,mm1 ; mm0=(00 02 ** **)
- punpckhwd mm4,mm1 ; mm4=(01 03 ** **)
- movq mm5,mm2
- punpcklwd mm2,mm3 ; mm2=(04 06 ** **)
- punpckhwd mm5,mm3 ; mm5=(05 07 ** **)
-
- punpcklwd mm0,mm4 ; mm0=(00 01 02 03)
- punpcklwd mm2,mm5 ; mm2=(04 05 06 07)
-
- movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
- movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
- pfadd mm6,mm7 ; mm6=(10 ** 11 **)
- pfadd mm1,mm7 ; mm1=(12 ** 13 **)
- pfadd mm3,mm7 ; mm3=(14 ** 15 **)
- pfadd mm4,mm7 ; mm4=(16 ** 17 **)
-
- movq mm5,mm6
- punpcklwd mm6,mm1 ; mm6=(10 12 ** **)
- punpckhwd mm5,mm1 ; mm5=(11 13 ** **)
- movq mm1,mm3
- punpcklwd mm3,mm4 ; mm3=(14 16 ** **)
- punpckhwd mm1,mm4 ; mm1=(15 17 ** **)
-
- punpcklwd mm6,mm5 ; mm6=(10 11 12 13)
- punpcklwd mm3,mm1 ; mm3=(14 15 16 17)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz near .quantloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
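
The float path above relies on a classic bit trick rather than a dedicated float-to-int instruction: the rndint_magic constant it loads (0x4B400000 == 12582912.0f == 1.5 * 2^23) is added to each product, which forces the rounded integer into the low mantissa bits, and the punpcklwd/punpckhwd shuffles then harvest those low 16 bits from each 32-bit lane. A minimal scalar C sketch of the trick (not part of the diff; assumes the default round-to-nearest FPU mode and a result that fits in 16 bits):

    #include <stdint.h>
    #include <string.h>

    static int16_t round_via_magic(float x)
    {
      float biased = x + 12582912.0f;       /* 1.5 * 2^23 (rndint_magic) */
      uint32_t bits;
      memcpy(&bits, &biased, sizeof(bits)); /* reinterpret without UB */
      return (int16_t)bits;                 /* low 16 mantissa bits hold round(x) */
    }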
diff --git a/media/libjpeg/simd/jquant-mmx.asm b/media/libjpeg/simd/jquant-mmx.asm
deleted file mode 100644
index aed6071bfc..0000000000
--- a/media/libjpeg/simd/jquant-mmx.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_mmx)
-
-EXTN(jsimd_convsamp_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pxor mm6,mm6 ; mm6=(all 0's)
- pcmpeqw mm7,mm7
- psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
-
- mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
- movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
-
- movq mm4,mm0
- punpcklbw mm0,mm6 ; mm0=(0123)
- punpckhbw mm4,mm6 ; mm4=(4567)
- movq mm5,mm1
- punpcklbw mm1,mm6 ; mm1=(89AB)
- punpckhbw mm5,mm6 ; mm5=(CDEF)
-
- paddw mm0,mm7
- paddw mm4,mm7
- paddw mm1,mm7
- paddw mm5,mm7
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
- movq mm0,mm2
- punpcklbw mm2,mm6 ; mm2=(GHIJ)
- punpckhbw mm0,mm6 ; mm0=(KLMN)
- movq mm4,mm3
- punpcklbw mm3,mm6 ; mm3=(OPQR)
- punpckhbw mm4,mm6 ; mm4=(STUV)
-
- paddw mm2,mm7
- paddw mm0,mm7
- paddw mm3,mm7
- paddw mm4,mm7
-
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
- add esi, byte 4*SIZEOF_JSAMPROW
- add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz short .convloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; DCTELEM *divisors
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_mmx)
-
-EXTN(jsimd_quantize_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov ah, 2
- alignx 16,7
-.quantloop1:
- mov al, DCTSIZE2/8/2
- alignx 16,7
-.quantloop2:
- movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
- movq mm0,mm2
- movq mm1,mm3
-
- psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise
- psraw mm3,(WORD_BIT-1)
-
- pxor mm0,mm2 ; val = -val
- pxor mm1,mm3
- psubw mm0,mm2
- psubw mm1,mm3
-
- ;
- ; MMX is an annoyingly crappy instruction set. It has two
- ; misfeatures that are causing problems here:
- ;
- ; - All multiplications are signed.
- ;
- ; - The second operand for the shifts is not treated as packed.
- ;
- ;
- ; We work around the first problem by implementing this algorithm:
- ;
- ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
- ; {
- ; enum { SHORT_BIT = 16 };
- ; signed short sx = (signed short) x;
- ; signed short sy = (signed short) y;
- ; signed long sz;
- ;
- ; sz = (long) sx * (long) sy; /* signed multiply */
- ;
- ; if (sx < 0) sz += (long) sy << SHORT_BIT;
- ; if (sy < 0) sz += (long) sx << SHORT_BIT;
- ;
- ; return (unsigned long) sz;
- ; }
- ;
- ; (note that a negative sx adds _sy_ and vice versa)
- ;
- ; For the second problem, we replace the shift by a multiplication.
- ; Unfortunately that means we have to deal with the signed issue again.
- ;
-
- paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
- paddw mm1, MMWORD [CORRECTION(0,1,edx)]
-
- movq mm4,mm0 ; store current value for later
- movq mm5,mm1
- pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
- pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
- paddw mm0,mm4 ; reciprocal is always negative (MSB=1),
- paddw mm1,mm5 ; so we always need to add the initial value
- ; (input value is never negative as we
- ; inverted it at the start of this routine)
-
- ; here it gets a bit tricky as both scale
- ; and mm0/mm1 can be negative
- movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
- movq mm7, MMWORD [SCALE(0,1,edx)]
- movq mm4,mm0
- movq mm5,mm1
- pmulhw mm0,mm6
- pmulhw mm1,mm7
-
- psraw mm6,(WORD_BIT-1) ; determine if scale is negative
- psraw mm7,(WORD_BIT-1)
-
- pand mm6,mm4 ; and add input if it is
- pand mm7,mm5
- paddw mm0,mm6
- paddw mm1,mm7
-
- psraw mm4,(WORD_BIT-1) ; then check if negative input
- psraw mm5,(WORD_BIT-1)
-
- pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
- pand mm5, MMWORD [SCALE(0,1,edx)]
- paddw mm0,mm4
- paddw mm1,mm5
-
- pxor mm0,mm2 ; val = -val
- pxor mm1,mm3
- psubw mm0,mm2
- psubw mm1,mm3
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
- add esi, byte 8*SIZEOF_DCTELEM
- add edx, byte 8*SIZEOF_DCTELEM
- add edi, byte 8*SIZEOF_JCOEF
- dec al
- jnz near .quantloop2
- dec ah
- jnz near .quantloop1 ; to avoid branch misprediction
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
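
The pxor/psubw pairs at the top and bottom of the quantize loop above are the branch-free sign-magnitude trick: psraw by WORD_BIT-1 yields an all-ones mask for negative words, and (x ^ mask) - mask then negates exactly those lanes, both to take the absolute value on entry and to restore the sign on exit. A scalar equivalent (illustrative only; the SIMD code performs this on four words at a time):

    #include <stdint.h>

    static int16_t abs_branchless(int16_t x, int16_t *sign_mask)
    {
      int16_t s = (int16_t)(x < 0 ? -1 : 0); /* what psraw x,(WORD_BIT-1) computes */
      *sign_mask = s;
      return (int16_t)((x ^ s) - s);         /* -x if negative, x otherwise */
    }

    static int16_t restore_sign(int16_t mag, int16_t sign_mask)
    {
      return (int16_t)((mag ^ sign_mask) - sign_mask); /* same transform undoes it */
    }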
diff --git a/media/libjpeg/simd/jquant-sse.asm b/media/libjpeg/simd/jquant-sse.asm
deleted file mode 100644
index 1baf88f257..0000000000
--- a/media/libjpeg/simd/jquant-sse.asm
+++ /dev/null
@@ -1,210 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse)
-
-EXTN(jsimd_convsamp_float_sse):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw mm7,mm7
- psllw mm7,7
- packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb mm0,mm7 ; mm0=(01234567)
- psubb mm1,mm7 ; mm1=(89ABCDEF)
-
- punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
- punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
- punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
- punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
-
- punpcklwd mm4,mm2 ; mm4=(***0***1)
- punpckhwd mm2,mm2 ; mm2=(***2***3)
- punpcklwd mm5,mm0 ; mm5=(***4***5)
- punpckhwd mm0,mm0 ; mm0=(***6***7)
-
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
- psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
- cvtpi2ps xmm0,mm4 ; xmm0=(01**)
- cvtpi2ps xmm1,mm2 ; xmm1=(23**)
- psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
- psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
- cvtpi2ps xmm2,mm5 ; xmm2=(45**)
- cvtpi2ps xmm3,mm0 ; xmm3=(67**)
-
- punpcklwd mm6,mm3 ; mm6=(***8***9)
- punpckhwd mm3,mm3 ; mm3=(***A***B)
- punpcklwd mm4,mm1 ; mm4=(***C***D)
- punpckhwd mm1,mm1 ; mm1=(***E***F)
-
- psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
- psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
- cvtpi2ps xmm4,mm6 ; xmm4=(89**)
- cvtpi2ps xmm5,mm3 ; xmm5=(AB**)
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
- psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
- cvtpi2ps xmm6,mm4 ; xmm6=(CD**)
- cvtpi2ps xmm7,mm1 ; xmm7=(EF**)
-
- movlhps xmm0,xmm1 ; xmm0=(0123)
- movlhps xmm2,xmm3 ; xmm2=(4567)
- movlhps xmm4,xmm5 ; xmm4=(89AB)
- movlhps xmm6,xmm7 ; xmm6=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .convloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse)
-
-EXTN(jsimd_quantize_float_sse):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- movhlps xmm4,xmm0
- movhlps xmm5,xmm1
-
- cvtps2pi mm0,xmm0
- cvtps2pi mm1,xmm1
- cvtps2pi mm4,xmm4
- cvtps2pi mm5,xmm5
-
- movhlps xmm6,xmm2
- movhlps xmm7,xmm3
-
- cvtps2pi mm2,xmm2
- cvtps2pi mm3,xmm3
- cvtps2pi mm6,xmm6
- cvtps2pi mm7,xmm7
-
- packssdw mm0,mm4
- packssdw mm1,mm5
- packssdw mm2,mm6
- packssdw mm3,mm7
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz short .quantloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
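
All of the convsamp variants in these deleted files compute the same thing: copy one 8x8 block of unsigned samples into a signed workspace, re-centered around zero by subtracting CENTERJSAMPLE (128), which is what the psubb against the replicated 0x80 constant does eight or sixteen bytes at a time. A simplified scalar sketch with a hypothetical flattened signature (the real routines take a JSAMPARRAY and write either DCTELEM or FAST_FLOAT workspaces):

    typedef unsigned char JSAMPLE;

    static void convsamp_scalar(const JSAMPLE *rows[8], unsigned int start_col,
                                float *workspace)
    {
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
          *workspace++ = (float)rows[r][start_col + c] - 128.0f; /* CENTERJSAMPLE */
    }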
diff --git a/media/libjpeg/simd/jquantf-sse2-64.asm b/media/libjpeg/simd/jquantf-sse2-64.asm
deleted file mode 100644
index ef5c1f959e..0000000000
--- a/media/libjpeg/simd/jquantf-sse2-64.asm
+++ /dev/null
@@ -1,157 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/2
-.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
-
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
-
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
-
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
- add rsi, byte 2*SIZEOF_JSAMPROW
- add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz short .convloop
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT *divisors
-; r12 = FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/16
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
- add rsi, byte 16*SIZEOF_FAST_FLOAT
- add rdx, byte 16*SIZEOF_FAST_FLOAT
- add rdi, byte 16*SIZEOF_JCOEF
- dec rax
- jnz short .quantloop
-
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
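
Arithmetically, the float quantizer above is simple: multiply each coefficient by a precomputed reciprocal divisor, round to nearest, and saturate into a 16-bit JCOEF; the mulps / cvtps2dq / packssdw sequence handles 16 coefficients per loop iteration. A scalar sketch of the same computation (illustrative, not the shipped code):

    #include <math.h>
    #include <stdint.h>

    static void quantize_float_scalar(int16_t coef_block[64],
                                      const float divisors[64],
                                      const float workspace[64])
    {
      for (int i = 0; i < 64; i++) {
        long v = lrintf(workspace[i] * divisors[i]); /* round to nearest */
        if (v >  32767) v =  32767;                  /* packssdw saturates */
        if (v < -32768) v = -32768;
        coef_block[i] = (int16_t)v;
      }
    }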
diff --git a/media/libjpeg/simd/jquantf-sse2.asm b/media/libjpeg/simd/jquantf-sse2.asm
deleted file mode 100644
index 1cbc267400..0000000000
--- a/media/libjpeg/simd/jquantf-sse2.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
-
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
-
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
-
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz short .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jquanti-sse2-64.asm b/media/libjpeg/simd/jquanti-sse2-64.asm
deleted file mode 100644
index 66c4e51907..0000000000
--- a/media/libjpeg/simd/jquanti-sse2-64.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/4
-.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 4*SIZEOF_JSAMPROW
- add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec rcx
- jnz short .convloop
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM *divisors
-; r12 = DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/32
-.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 32*SIZEOF_DCTELEM
- add rdx, byte 32*SIZEOF_DCTELEM
- add rdi, byte 32*SIZEOF_JCOEF
- dec rax
- jnz near .quantloop
-
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jquanti-sse2.asm b/media/libjpeg/simd/jquanti-sse2.asm
deleted file mode 100644
index aea8604e22..0000000000
--- a/media/libjpeg/simd/jquanti-sse2.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 4*SIZEOF_JSAMPROW
- add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; DCTELEM *divisors
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/32
- alignx 16,7
-.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 32*SIZEOF_DCTELEM
- add edx, byte 32*SIZEOF_DCTELEM
- add edi, byte 32*SIZEOF_JCOEF
- dec eax
- jnz near .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
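
As the RECIPROCAL/CORRECTION/SCALE macros above encode, the divisors argument to the integer quantizers is really three stacked 64-entry tables (four, counting the SHIFT table the MMX variant defines) laid out at DCTSIZE2-sized offsets. A scalar sketch of one coefficient's dataflow, modeling pmulhuw as a 16x16 -> high-16 multiply; the actual table contents are built elsewhere (jcdctmgr.c), so treat this as an illustration of the dataflow rather than the exact fixed-point contract:

    #include <stdint.h>

    static int16_t quantize_int_scalar(int16_t coef,
                                       const uint16_t divisors[192], int i)
    {
      const uint16_t *recip = divisors;       /* DCTSIZE2*0: reciprocal  */
      const uint16_t *corr  = divisors + 64;  /* DCTSIZE2*1: correction  */
      const uint16_t *scale = divisors + 128; /* DCTSIZE2*2: scale       */
      uint16_t mag = (uint16_t)(coef < 0 ? -coef : coef);
      uint32_t t;

      mag = (uint16_t)(mag + corr[i]);        /* correction + roundfactor */
      t = ((uint32_t)mag * recip[i]) >> 16;   /* pmulhuw by reciprocal    */
      t = (t * scale[i]) >> 16;               /* pmulhuw by scale         */
      return (int16_t)(coef < 0 ? -(int32_t)t : (int32_t)t); /* restore sign */
    }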
diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h
index dc6ec430db..64747c6360 100644
--- a/media/libjpeg/simd/jsimd.h
+++ b/media/libjpeg/simd/jsimd.h
@@ -2,10 +2,12 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,857 +17,1242 @@
/* Bitmask for supported acceleration methods */
-#define JSIMD_NONE 0x00
-#define JSIMD_MMX 0x01
-#define JSIMD_3DNOW 0x02
-#define JSIMD_SSE 0x04
-#define JSIMD_SSE2 0x08
-#define JSIMD_ARM_NEON 0x10
-#define JSIMD_MIPS_DSPR2 0x20
-#define JSIMD_ALTIVEC 0x40
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+#define JSIMD_NEON 0x10
+#define JSIMD_DSPR2 0x20
+#define JSIMD_ALTIVEC 0x40
+#define JSIMD_AVX2 0x80
+#define JSIMD_MMI 0x100
/* SIMD Ext: retrieve SIMD/CPU information */
-EXTERN(unsigned int) jpeg_simd_cpu_support (void);
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
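
Since jpeg_simd_cpu_support() returns this bitmask, callers test for a specific extension with a simple AND. A hedged usage sketch (the macros and function are declared above; the printf scaffolding is purely illustrative):

    #include <stdio.h>

    /* Assumes jsimd.h's bitmask macros and jpeg_simd_cpu_support() are in scope. */
    static void report_simd(void)
    {
      unsigned int simd = jpeg_simd_cpu_support();
      if (simd & JSIMD_SSE2)  printf("SSE2 acceleration available\n");
      if (simd & JSIMD_AVX2)  printf("AVX2 acceleration available\n");
      if (simd & JSIMD_NEON)  printf("Neon acceleration available\n");
      if (simd == JSIMD_NONE) printf("no SIMD acceleration\n");
    }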
/* RGB & extended RGB --> YCC Colorspace Conversion */
EXTERN(void) jsimd_rgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
extern const int jconst_rgb_ycc_convert_sse2[];
EXTERN(void) jsimd_rgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
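Every rgb_ycc kernel declared above, whatever the instruction set, computes the same JFIF (ITU-R BT.601) forward transform; the ext* variants differ only in the memory order of the RGB samples they fetch. A minimal scalar sketch of that math, with hypothetical names and floating point for clarity (the real kernels use scaled fixed-point tables):

    /* Scalar reference for the SIMD rgb_ycc kernels (illustrative only). */
    static void rgb_to_ycc_pixel(unsigned char r, unsigned char g,
                                 unsigned char b, unsigned char *y,
                                 unsigned char *cb, unsigned char *cr)
    {
      *y  = (unsigned char)( 0.29900 * r + 0.58700 * g + 0.11400 * b);
      *cb = (unsigned char)(-0.16874 * r - 0.33126 * g + 0.50000 * b + 128.0);
      *cr = (unsigned char)( 0.50000 * r - 0.41869 * g - 0.08131 * b + 128.0);
    }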
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
EXTERN(void) jsimd_rgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
extern const int jconst_rgb_gray_convert_sse2[];
EXTERN(void) jsimd_rgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
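The gray kernels keep only the luma term of the transform above. At run time the per-architecture jsimd.c glue picks among these declarations once CPU features are probed; a rough sketch of that dispatch (simd_support and the JSIMD_* flags follow libjpeg-turbo's convention, but treat the exact spellings here as assumptions):

    /* Hypothetical dispatcher: route to the widest supported kernel. */
    void rgb_gray_convert_dispatch(JDIMENSION img_width, JSAMPARRAY input_buf,
                                   JSAMPIMAGE output_buf, JDIMENSION output_row,
                                   int num_rows)
    {
      if (simd_support & JSIMD_AVX2)
        jsimd_rgb_gray_convert_avx2(img_width, input_buf, output_buf,
                                    output_row, num_rows);
      else if (simd_support & JSIMD_SSE2)
        jsimd_rgb_gray_convert_sse2(img_width, input_buf, output_buf,
                                    output_row, num_rows);
      else
        jsimd_rgb_gray_convert_mmx(img_width, input_buf, output_buf,
                                   output_row, num_rows);
    }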
/* YCC --> RGB & extended RGB Colorspace Conversion */
EXTERN(void) jsimd_ycc_rgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
extern const int jconst_ycc_rgb_convert_sse2[];
EXTERN(void) jsimd_ycc_rgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb565_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-
-EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
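The decompression-side ycc_rgb kernels apply the inverse JFIF transform, clamping each result to the sample range. A scalar sketch with hypothetical helper names (the SIMD kernels use fixed-point arithmetic and a range-limit table rather than per-pixel clamping):

    static unsigned char clamp255(double v)
    {
      return (unsigned char)(v < 0.0 ? 0.0 : (v > 255.0 ? 255.0 : v));
    }

    static void ycc_to_rgb_pixel(unsigned char y, unsigned char cb,
                                 unsigned char cr, unsigned char *r,
                                 unsigned char *g, unsigned char *b)
    {
      double dcb = cb - 128.0, dcr = cr - 128.0;    /* re-center chroma */
      *r = clamp255(y + 1.40200 * dcr);
      *g = clamp255(y - 0.34414 * dcb - 0.71414 * dcr);
      *b = clamp255(y + 1.77200 * dcb);
    }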
/* NULL Colorspace Conversion */
-EXTERN(void) jsimd_c_null_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows, int num_components);
+EXTERN(void) jsimd_c_null_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows, int num_components);
/* h2v1 Downsampling */
EXTERN(void) jsimd_h2v1_downsample_mmx
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_sse2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_neon
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
-EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_altivec
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
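h2v1 downsampling halves the chroma resolution horizontally by averaging sample pairs; libjpeg alternates the rounding bias so truncation does not drift in one direction. A one-row scalar sketch (names illustrative):

    /* Average horizontal pairs into out_width output samples. */
    static void h2v1_downsample_row(const unsigned char *in,
                                    unsigned char *out, int out_width)
    {
      int bias = 0;                       /* alternate 0,1,0,1,... */
      for (int i = 0; i < out_width; i++) {
        out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }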
/* h2v2 Downsampling */
EXTERN(void) jsimd_h2v2_downsample_mmx
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_sse2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_neon
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
-EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_mmi
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_altivec
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
/* h2v2 Smooth Downsampling */
-EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
- (JSAMPARRAY input_data, JSAMPARRAY output_data,
- JDIMENSION v_samp_factor, int max_v_samp_factor,
- int smoothing_factor, JDIMENSION width_blocks,
- JDIMENSION image_width);
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+ (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+ int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+ JDIMENSION image_width);
/* Upsampling */
EXTERN(void) jsimd_h2v1_upsample_mmx
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_mmx
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_upsample_sse2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_sse2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-EXTERN(void) jsimd_int_upsample_mips_dspr2
- (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
- int max_v_samp_factor);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+ (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+ int max_v_samp_factor);
EXTERN(void) jsimd_h2v1_upsample_altivec
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_altivec
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
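Plain upsampling is pixel replication: h2v1 doubles each chroma sample horizontally, and h2v2 additionally duplicates whole rows. A one-row sketch (names illustrative):

    /* Replicate each input sample into two adjacent output samples. */
    static void h2v1_upsample_row(const unsigned char *in,
                                  unsigned char *out, int in_width)
    {
      for (int i = 0; i < in_width; i++) {
        out[2 * i]     = in[i];
        out[2 * i + 1] = in[i];
      }
    }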
/* Fancy Upsampling */
EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
extern const int jconst_fancy_upsample_sse2[];
EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample_neon
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
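Fancy upsampling replaces replication with a triangle filter: each output sample weights its nearest input 3x and its next-nearest 1x, which smooths chroma edges while staying cheap enough for SIMD. An interior-sample sketch using libjpeg's usual rounding offsets (edge samples need special-casing, omitted here):

    /* Triangle-filter one interior input sample into two outputs. */
    static void h2v1_fancy_pair(const unsigned char *in,
                                unsigned char *out, int i)
    {
      out[2 * i]     = (unsigned char)((in[i] * 3 + in[i - 1] + 1) >> 2);
      out[2 * i + 1] = (unsigned char)((in[i] * 3 + in[i + 1] + 2) >> 2);
    }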
/* Merged Upsampling */
EXTERN(void) jsimd_h2v1_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
extern const int jconst_merged_upsample_sse2[];
EXTERN(void) jsimd_h2v1_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
-EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-
-EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
/* Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_sse2
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_neon
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_mips_dspr2
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_altivec
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
/* Floating Point Sample Conversion */
EXTERN(void) jsimd_convsamp_float_3dnow
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
EXTERN(void) jsimd_convsamp_float_sse
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
EXTERN(void) jsimd_convsamp_float_sse2
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
-EXTERN(void) jsimd_convsamp_float_mips_dspr2
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp_float_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
-/* Slow Integer Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
+
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
/* Fast Integer Forward DCT */
-EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
/* Floating Point Forward DCT */
-EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
/* Quantization */
EXTERN(void) jsimd_quantize_mmx
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_sse2
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_neon
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_dspr2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_mips_dspr2
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_mmi
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_altivec
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
/* Floating Point Quantization */
EXTERN(void) jsimd_quantize_float_3dnow
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
EXTERN(void) jsimd_quantize_float_sse
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
EXTERN(void) jsimd_quantize_float_sse2
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
-EXTERN(void) jsimd_quantize_float_mips_dspr2
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize_float_dspr2
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
/* Scaled Inverse DCT */
EXTERN(void) jsimd_idct_2x2_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_red_sse2[];
EXTERN(void) jsimd_idct_2x2_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_2x2_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-
-EXTERN(void) jsimd_idct_2x2_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col, int *workspace);
-EXTERN(void) jsimd_idct_6x6_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
- (JCOEFPTR coef_block, void *dct_table, int *workspace);
-EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
- (int *workspace, int *output);
-
-/* Slow Integer Inverse DCT */
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+ (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+ (int *workspace, int *output);
+
+/* Accurate Integer Inverse DCT */
EXTERN(void) jsimd_idct_islow_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_islow_sse2[];
EXTERN(void) jsimd_idct_islow_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_islow_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_dspr2
+ (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
-EXTERN(void) jsimd_idct_islow_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, int *output_buf,
- JSAMPLE *output_col);
+EXTERN(void) jsimd_idct_islow_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_islow_altivec
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Fast Integer Inverse DCT */
EXTERN(void) jsimd_idct_ifast_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_ifast_sse2[];
EXTERN(void) jsimd_idct_ifast_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_ifast_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+ (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+ const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+ (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+ const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
- (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
- const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
- (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
- const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_ifast_altivec
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Floating Point Inverse DCT */
EXTERN(void) jsimd_idct_float_3dnow
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_float_sse[];
EXTERN(void) jsimd_idct_float_sse
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_float_sse2[];
EXTERN(void) jsimd_idct_float_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Huffman coding */
extern const int jconst_huff_encode_one_block[];
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
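+
The slowtbl variant above is guarded because it exists only in the hand-written GAS implementation; when the NEON intrinsics implementation is built (NEON_INTRINSICS defined), only the plain kernel is available. A sketch of how a dispatcher might choose between the two, assuming the simd_features flag word and JSIMD_FASTTBL bit defined by the deleted jsimd_arm64.c further down:

    /* Sketch only: mirrors the selection logic of the old jsimd_arm64.c
       dispatcher; simd_features/JSIMD_FASTTBL are its file-local flag word
       and feature bit. */
    GLOBAL(JOCTET *)
    jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
                                int last_dc_val, c_derived_tbl *dctbl,
                                c_derived_tbl *actbl)
    {
    #ifndef NEON_INTRINSICS
      if (!(simd_features & JSIMD_FASTTBL))
        return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
                                                        last_dc_val, dctbl,
                                                        actbl);
    #endif
      return jsimd_huff_encode_one_block_neon(state, buffer, block,
                                              last_dc_val, dctbl, actbl);
    }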
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
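
For readers unfamiliar with the IJG conventions used throughout these prototypes: EXTERN, GLOBAL, and LOCAL are linkage wrapper macros defined in jmorecfg.h, kept so that unusual linkers can redefine them. Their usual expansion:

    /* Linkage wrappers from jmorecfg.h (IJG convention): */
    #define EXTERN(type)  extern type   /* prototype for an entry point */
    #define GLOBAL(type)  type          /* definition with external linkage */
    #define LOCAL(type)   static type   /* file-private helper */

    /* So a prototype such as
         EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon(...);
       declares an ordinary external function, here implemented in SIMD
       assembly or intrinsics. */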
diff --git a/media/libjpeg/simd/jsimd_arm.c b/media/libjpeg/simd/jsimd_arm.c
deleted file mode 100644
index 61cd073e1b..0000000000
--- a/media/libjpeg/simd/jsimd_arm.c
+++ /dev/null
@@ -1,727 +0,0 @@
-/*
- * jsimd_arm.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
- char *p;
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "Features", 8) != 0)
- return 0;
- buffer += 8;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
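-
check_feature() above first requires the line to begin with "Features" and then accepts the feature name only as a whole whitespace-delimited token, so "neon" does not match inside "neonfoo". A self-contained sketch of just the token test (has_word is a hypothetical name, not part of the library):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Word-boundary substring search, as in check_feature() above. */
    static int has_word(const char *s, const char *word)
    {
      const char *p = s;
      size_t n = strlen(word);

      if (n == 0)
        return 0;
      while ((p = strstr(p, word))) {
        int left_ok  = (p == s) || isspace((unsigned char)p[-1]);
        int right_ok = (p[n] == '\0') || isspace((unsigned char)p[n]);
        if (left_ok && right_ok)
          return 1;
        p++;                    /* partial match; keep scanning */
      }
      return 0;
    }

    int main(void)
    {
      printf("%d\n", has_word("half thumb fastmult vfp edsp neon", "neon"));
      /* -> 1 */
      printf("%d\n", has_word("half thumb neonfoo vfpv3", "neon"));
      /* -> 0 */
      return 0;
    }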
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "neon"))
- simd_support |= JSIMD_ARM_NEON;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__ARM_NEON__)
- simd_support |= JSIMD_ARM_NEON;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  /* We still have a chance to use NEON, regardless of the -mcpu/-mfpu
-   * options passed to gcc, by performing runtime detection via
-   * /proc/cpuinfo parsing on Linux/Android. */
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ARM_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
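-
The three environment overrides are read once, inside the first init_simd() call, so they must be set before the first compress or decompress operation touches a SIMD code path (the 64-bit dispatcher below adds JSIMD_FASTLD3/JSIMD_FASTST3 on the same pattern). A hypothetical debugging snippet, assuming a POSIX setenv():

    #include <stdlib.h>

    /* Call before the first jpeg_start_compress()/jpeg_start_decompress(). */
    void disable_simd_for_debugging(void)
    {
      setenv("JSIMD_FORCENONE", "1", 1);  /* run only the portable C paths */
      /* setenv("JSIMD_FORCENEON", "1", 1);  force NEON, skip detection   */
      /* setenv("JSIMD_NOHUFFENC", "1", 1);  keep SIMD, C Huffman coding  */
    }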
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- neonfct=jsimd_extrgb_ycc_convert_neon;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_extrgbx_ycc_convert_neon;
- break;
- case JCS_EXT_BGR:
- neonfct=jsimd_extbgr_ycc_convert_neon;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_extbgrx_ycc_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_extxbgr_ycc_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_extxrgb_ycc_convert_neon;
- break;
- default:
- neonfct=jsimd_extrgb_ycc_convert_neon;
- break;
- }
-
- neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- neonfct=jsimd_ycc_extrgb_convert_neon;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_ycc_extrgbx_convert_neon;
- break;
- case JCS_EXT_BGR:
- neonfct=jsimd_ycc_extbgr_convert_neon;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_ycc_extbgrx_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_ycc_extxbgr_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_ycc_extxrgb_convert_neon;
- break;
- default:
- neonfct=jsimd_ycc_extrgb_convert_neon;
- break;
- }
-
- neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
- output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON && simd_huffman)
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm64.c b/media/libjpeg/simd/jsimd_arm64.c
deleted file mode 100644
index 09449bb6fd..0000000000
--- a/media/libjpeg/simd/jsimd_arm64.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * jsimd_arm64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
-#define JSIMD_FASTTBL 4
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
- JSIMD_FASTTBL;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_cpuinfo (char *buffer, const char *field, char *value)
-{
- char *p;
- if (*value == 0)
- return 0;
- if (strncmp(buffer, field, strlen(field)) != 0)
- return 0;
- buffer += strlen(field);
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'value' is present in the buffer as a separate word */
- while ((p = strstr(buffer, value))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(value);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
- check_cpuinfo(buffer, "CPU part", "0xd07"))
- /* The Cortex-A53 has a slow tbl implementation. We can gain a few
- percent speedup by disabling the use of that instruction. The
- speedup on Cortex-A57 is more subtle but still measurable. */
- simd_features &= ~JSIMD_FASTTBL;
- else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
- /* The SIMD version of Huffman encoding is slower than the C version on
-           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
- CPU. */
- simd_huffman = simd_features = 0;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
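-
check_cpuinfo() matches a "field: value" pair the same way check_feature() does in the 32-bit file above. The part numbers tested here are ARM MIDR values: 0xd03 is Cortex-A53, 0xd07 Cortex-A57, and 0x0a1 the Cavium ThunderX core. A runnable illustration of the effect (the matching shown is a simplification of check_cpuinfo()):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      const char *line = "CPU part\t: 0xd03";  /* one /proc/cpuinfo line */

      /* Simplified field/value test in the spirit of check_cpuinfo(). */
      if (strncmp(line, "CPU part", 8) == 0 && strstr(line, "0xd03"))
        puts("Cortex-A53: clear JSIMD_FASTTBL (slow tbl instruction)");
      return 0;
    }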
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-
-/*
- * ARMv8 architectures support NEON extensions by default.
- * Unlike ARMv7, where NEON support was optional, it is a mandatory
- * part of ARMv8.
- */
-
-
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
- simd_support |= JSIMD_ARM_NEON;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ARM_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
- env = getenv("JSIMD_FASTLD3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTLD3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTLD3;
- env = getenv("JSIMD_FASTST3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTST3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTST3;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extrgb_ycc_convert_neon;
- else
- neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_extrgbx_ycc_convert_neon;
- break;
- case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extbgr_ycc_convert_neon;
- else
- neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_extbgrx_ycc_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_extxbgr_ycc_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_extxrgb_ycc_convert_neon;
- break;
- default:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extrgb_ycc_convert_neon;
- else
- neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
- break;
- }
-
- neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extrgb_convert_neon;
- else
- neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_ycc_extrgbx_convert_neon;
- break;
- case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extbgr_convert_neon;
- else
- neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_ycc_extbgrx_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_ycc_extxbgr_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_ycc_extxrgb_convert_neon;
- break;
- default:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extrgb_convert_neon;
- else
- neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
- break;
- }
-
- neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
- output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON && simd_huffman)
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- if (simd_features & JSIMD_FASTTBL)
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
- else
- return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
- last_dc_val, dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm_neon.S b/media/libjpeg/simd/jsimd_arm_neon.S
deleted file mode 100644
index cd2612724a..0000000000
--- a/media/libjpeg/simd/jsimd_arm_neon.S
+++ /dev/null
@@ -1,2878 +0,0 @@
-/*
- * ARMv7 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
- vtrn.16 \x0, \x1
- vtrn.16 \x2, \x3
- vtrn.32 \x0, \x2
- vtrn.32 \x1, \x3
-.endm
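-
The vtrn.16/vtrn.32 sequence performs an in-register 4x4 transpose of 16-bit elements: the two vtrn.16 swap adjacent 16-bit lanes, and the two vtrn.32 swap 32-bit pairs across registers. A scalar C equivalent, for reference only (not part of the library):

    #include <stdint.h>

    /* Transpose a 4x4 block of 16-bit values in place: m[r][c] -> m[c][r].
       The NEON macro achieves the same with two vtrn.16 + two vtrn.32. */
    static void transpose_4x4_scalar(int16_t m[4][4])
    {
      for (int r = 0; r < 4; r++)
        for (int c = r + 1; c < 4; c++) {
          int16_t t = m[r][c];
          m[r][c] = m[c][r];
          m[c][r] = t;
        }
    }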
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
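-
jsimd_idct_islow_neon dequantizes while loading (the vmul.s16 sequence in the body below) and then runs the two 1-D passes. A scalar sketch of the dequantize step, using plain 16-bit types since the jsimd_can_idct_islow() checks elsewhere in this diff require both JCOEF and ISLOW_MULT_TYPE to be 2 bytes:

    #include <stdint.h>

    /* Scalar equivalent of the vmul.s16 dequantize-on-load step (sketch). */
    static void dequantize_islow(const int16_t coef_block[64],
                                 const int16_t quantptr[64],
                                 int16_t out[64])
    {
      for (int i = 0; i < 64; i++)
        out[i] = (int16_t)(coef_block[i] * quantptr[i]);
    }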
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
-{ \
- DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
- JLONG q1, q2, q3, q4, q5, q6, q7; \
- JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
- \
- /* 1-D iDCT input data */ \
- row0 = xrow0; \
- row1 = xrow1; \
- row2 = xrow2; \
- row3 = xrow3; \
- row4 = xrow4; \
- row5 = xrow5; \
- row6 = xrow6; \
- row7 = xrow7; \
- \
- q5 = row7 + row3; \
- q4 = row5 + row1; \
- q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
- MULTIPLY(q4, FIX_1_175875602); \
- q7 = MULTIPLY(q5, FIX_1_175875602) + \
- MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
- q2 = MULTIPLY(row2, FIX_0_541196100) + \
- MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
- q4 = q6; \
- q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
- q6 += MULTIPLY(row5, -FIX_2_562915447) + \
- MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
- /* now we can use q1 (reloadable constants have been used up) */ \
- q1 = q3 + q2; \
- q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
- MULTIPLY(row1, -FIX_0_899976223); \
- q5 = q7; \
- q1 = q1 + q6; \
- q7 += MULTIPLY(row7, -FIX_0_899976223) + \
- MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
- \
- /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
- tmp11_plus_tmp2 = q1; \
- row1 = 0; \
- \
- q1 = q1 - q6; \
- q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
- MULTIPLY(row3, -FIX_2_562915447); \
- q1 = q1 - q6; \
- q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
- MULTIPLY(row6, FIX_0_541196100); \
- q3 = q3 - q2; \
- \
- /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
- tmp11_minus_tmp2 = q1; \
- \
- q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
- q2 = q1 + q6; \
- q1 = q1 - q6; \
- \
- /* pick up the results */ \
- tmp0 = q4; \
- tmp1 = q5; \
- tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
- tmp3 = q7; \
- tmp10 = q2; \
- tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
- tmp12 = q3; \
- tmp13 = q1; \
-}
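-
The FIX_* constants are the ISLOW rotation factors in 13-bit fixed point, matching CONST_BITS = 13 in jidctint.c: FIX_x = round(x * 2^13), which is also why the DC terms in the macro above are shifted left by 13. The pre-combined MINUS/PLUS constants fold the usual common-subexpression rearrangement (for example a*(x+y) - b*y = a*x + (a-b)*y) into single multiply-accumulates. A quick self-check of two of the values:

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
      /* FIX_x = round(x * 2^13); spot-check against the defines above. */
      assert((long)floor(0.541196100 * 8192.0 + 0.5) == 4433);
      assert((long)floor(1.175875602 * 8192.0 + 0.5) == 9633);
      return 0;
    }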
-
-#define XFIX_0_899976223 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_2_562915447 d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
-#define XFIX_1_175875602 d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
- .short FIX_0_899976223 /* d0[0] */
- .short FIX_0_541196100 /* d0[1] */
- .short FIX_2_562915447 /* d0[2] */
- .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
- .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
- .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
- .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
- .short FIX_1_175875602 /* d1[3] */
- /* reloadable constants */
- .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
- .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
- .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
- .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- ROW0L .req d16
- ROW0R .req d17
- ROW1L .req d18
- ROW1R .req d19
- ROW2L .req d20
- ROW2R .req d21
- ROW3L .req d22
- ROW3R .req d23
- ROW4L .req d24
- ROW4R .req d25
- ROW5L .req d26
- ROW5R .req d27
- ROW6L .req d28
- ROW6R .req d29
- ROW7L .req d30
- ROW7R .req d31
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_islow_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
- add ip, ip, #16
- vmul.s16 q15, q15, q3
- vpush {d8-d15} /* save NEON registers */
- /* 1-D IDCT, pass 1, left 4x8 half */
- vadd.s16 d4, ROW7L, ROW3L
- vadd.s16 d5, ROW5L, ROW1L
- vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d5, XFIX_1_175875602
- vmull.s16 q7, d4, XFIX_1_175875602
-    /* Check whether the right 4x8 half contains only zero coefficients */
- push {r4, r5}
- vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW4L
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
- orr r0, r4, r5
- vmov q4, q6
- vmlsl.s16 q6, ROW5L, XFIX_2_562915447
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- orr r0, r0, r4
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- orr r0, r0, r5
- vadd.s32 q1, q3, q2
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
- vmov q5, q7
- vadd.s32 q1, q1, q6
- orr r0, r0, r4
- vmlsl.s16 q7, ROW7L, XFIX_0_899976223
- orr r0, r0, r5
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1L, q1, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
- orr r0, r0, r4
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- orr r0, r0, r5
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
- vmlal.s16 q6, ROW6L, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- orr r0, r0, r4
- vrshrn.s32 ROW6L, q1, #11
- orr r0, r0, r5
- vadd.s32 q1, q3, q5
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW4L
- orr r0, r0, r4
- vrshrn.s32 ROW2L, q1, #11
- orr r0, r0, r5
- vrshrn.s32 ROW5L, q3, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
- orr r0, r0, r4
- vadd.s32 q2, q5, q6
- orrs r0, r0, r5
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- orr r0, r4, r5
- vsub.s32 q3, q1, q4
- pop {r4, r5}
- vrshrn.s32 ROW7L, q2, #11
- vrshrn.s32 ROW3L, q5, #11
- vrshrn.s32 ROW0L, q6, #11
- vrshrn.s32 ROW4L, q3, #11
-
-    beq 3f /* Branch to the special handling for the sparse
-              right 4x8 half */
-
- /* 1-D IDCT, pass 1, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vadd.s16 d10, ROW7R, ROW3R
- vadd.s16 d8, ROW5R, ROW1R
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vtrn.16 ROW2L, ROW3L
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vtrn.16 ROW0L, ROW1L
- vsubl.s16 q3, ROW0R, ROW4R
- vmull.s16 q2, ROW2R, XFIX_0_541196100
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vtrn.16 ROW4L, ROW5L
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
- vtrn.32 ROW1L, ROW3L
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1R, XFIX_0_899976223
- vtrn.32 ROW4L, ROW6L
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vtrn.32 ROW0L, ROW2L
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1R, q1, #11
- vtrn.32 ROW5L, ROW7L
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW3R, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vrshrn.s32 ROW6R, q1, #11
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0R, ROW4R
- vrshrn.s32 ROW2R, q1, #11
- vrshrn.s32 ROW5R, q3, #11
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vrshrn.s32 ROW7R, q2, #11
- vrshrn.s32 ROW3R, q5, #11
- vrshrn.s32 ROW0R, q6, #11
- vrshrn.s32 ROW4R, q3, #11
- /* Transpose right 4x8 half */
- vtrn.16 ROW6R, ROW7R
- vtrn.16 ROW2R, ROW3R
- vtrn.16 ROW0R, ROW1R
- vtrn.16 ROW4R, ROW5R
- vtrn.32 ROW1R, ROW3R
- vtrn.32 ROW4R, ROW6R
- vtrn.32 ROW0R, ROW2R
- vtrn.32 ROW5R, ROW7R
-
-1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
- vmov q4, q6
- vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5R, XFIX_1_175875602
- vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmull.s16 q7, ROW7R, XFIX_1_175875602
- vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
-
-2: /* Descale to 8-bit and range limit */
- vqrshrn.s16 d16, q8, #2
- vqrshrn.s16 d17, q9, #2
- vqrshrn.s16 d18, q10, #2
- vqrshrn.s16 d19, q11, #2
- vpop {d8-d15} /* restore NEON registers */
- vqrshrn.s16 d20, q12, #2
- /* Transpose the final 8-bit samples and do signed->unsigned conversion */
- vtrn.16 q8, q9
- vqrshrn.s16 d21, q13, #2
- vqrshrn.s16 d22, q14, #2
- vmov.u8 q0, #(CENTERJSAMPLE)
- vqrshrn.s16 d23, q15, #2
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vtrn.16 q10, q11
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vadd.u8 q10, q10, q0
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vtrn.8 d22, d23
- vst1.8 {d20}, [TMP1]
- vadd.u8 q11, q11, q0
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
-3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vtrn.16 ROW2L, ROW3L
- vtrn.16 ROW0L, ROW1L
- vtrn.16 ROW4L, ROW5L
- vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
- vtrn.32 ROW1L, ROW3L
- vtrn.32 ROW4L, ROW6L
- vtrn.32 ROW0L, ROW2L
- vtrn.32 ROW5L, ROW7L
-
- cmp r0, #0
- beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
- pass */
-
- /* Only row 0 is non-zero for the right 4x8 half */
- vdup.s16 ROW1R, ROW0R[1]
- vdup.s16 ROW2R, ROW0R[2]
- vdup.s16 ROW3R, ROW0R[3]
- vdup.s16 ROW4R, ROW0R[0]
- vdup.s16 ROW5R, ROW0R[1]
- vdup.s16 ROW6R, ROW0R[2]
- vdup.s16 ROW7R, ROW0R[3]
- vdup.s16 ROW0R, ROW0R[0]
- b 1b /* Go to 'normal' second pass */
-
-4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vshll.s16 q3, ROW0L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW0L, #13
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5L, XFIX_1_175875602
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW7L, XFIX_1_175875602
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW6L, XFIX_0_541196100
- vshll.s16 q3, ROW4L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW4L, #13
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
- b 2b /* Go to epilogue */
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
- .unreq ROW0L
- .unreq ROW0R
- .unreq ROW1L
- .unreq ROW1R
- .unreq ROW2L
- .unreq ROW2R
- .unreq ROW3L
- .unreq ROW3R
- .unreq ROW4L
- .unreq ROW4R
- .unreq ROW5L
- .unreq ROW5R
- .unreq ROW6L
- .unreq ROW6R
- .unreq ROW7L
- .unreq ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, but less accurate, integer implementation
- * of the inverse DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_idct_ifast' function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in the ARM NEON case some extra additions are required, because the
- * VQDMULH instruction can't handle constants larger than 1. So expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
- */
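-
-/* A rough C model of the VQDMULH trick (saturation omitted): VQDMULH
- * computes approximately (a * b) >> 15, so a multiplier c > 1 is split
- * into a fractional part handled by VQDMULH plus an integer part handled
- * by plain additions; the constants below are stored with 8-bit precision
- * scaled into Q15, e.g. (277 * 128 - 256 * 128) for 1.082392200.
- *
- *   static inline int16_t vqdmulh_s16(int16_t a, int16_t b)
- *   {
- *     return (int16_t) (((int32_t) a * b) >> 15);
- *   }
- *
- *   static inline int16_t mul_1_082392200(int16_t x)
- *   {
- *     // x * 1.082392200 == x * 0.082392200 + x
- *     return vqdmulh_s16(x, 277 * 128 - 256 * 128) + x;
- *   }
- */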
-
-#define XFIX_1_082392200 d0[0]
-#define XFIX_1_414213562 d0[1]
-#define XFIX_1_847759065 d0[2]
-#define XFIX_2_613125930 d0[3]
-
-.balign 16
-jsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_ifast_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0}, [ip, :64] /* load constants */
- vmul.s16 q15, q15, q3
- vpush {d8-d13} /* save NEON registers */
- /* 1-D IDCT, pass 1 */
- vsub.s16 q2, q10, q14
- vadd.s16 q14, q10, q14
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vadd.s16 q10, q10, q2
- /* Transpose */
- vtrn.16 q8, q9
- vsub.s16 q11, q12, q1
- vtrn.16 q14, q15
- vadd.s16 q12, q12, q1
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q8, q10
- vtrn.32 q13, q15
- vswp d28, d21
- vswp d26, d19
- /* 1-D IDCT, pass 2 */
- vsub.s16 q2, q10, q14
- vswp d30, d23
- vadd.s16 q14, q10, q14
- vswp d24, d17
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vpop {d8-d13} /* restore NEON registers */
- vadd.s16 q10, q10, q2
- vsub.s16 q11, q12, q1
- vadd.s16 q12, q12, q1
- /* Descale to 8-bit and range limit */
- vmov.u8 q0, #0x80
- vqshrn.s16 d16, q8, #5
- vqshrn.s16 d17, q9, #5
- vqshrn.s16 d18, q10, #5
- vqshrn.s16 d19, q11, #5
- vqshrn.s16 d20, q12, #5
- vqshrn.s16 d21, q13, #5
- vqshrn.s16 d22, q14, #5
- vqshrn.s16 d23, q15, #5
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vadd.u8 q10, q10, q0
- vadd.u8 q11, q11, q0
- /* Transpose the final 8-bit samples */
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vst1.8 {d20}, [TMP1]
- vtrn.8 d22, d23
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling could be achieved by
- * expanding the idct_helper/transpose_4x4 macros and reordering
- * instructions, but readability would suffer somewhat.
- */
-
-#define CONST_BITS 13
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
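-
-/* The constants above follow the usual libjpeg fixed-point convention
- * (a sketch mirroring the FIX() macro from the scalar code):
- *
- *   #define FIX(x)  ((int32_t) ((x) * (1 << CONST_BITS) + 0.5))
- *
- * so that, e.g., FIX(1.847759065) == 15137 with CONST_BITS == 13.
- */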
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* d0[0] */
- .short -FIX_0_765366865 /* d0[1] */
- .short -FIX_0_211164243 /* d0[2] */
- .short FIX_1_451774981 /* d0[3] */
- .short -FIX_2_172734803 /* d1[0] */
- .short FIX_1_061594337 /* d1[1] */
- .short -FIX_0_509795579 /* d1[2] */
- .short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* d2[0] */
- .short FIX_2_562915447 /* d2[1] */
- .short 1 << (CONST_BITS+1) /* d2[2] */
- .short 0 /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- vmull.s16 q14, \x4, d2[2]
- vmlal.s16 q14, \x8, d0[0]
- vmlal.s16 q14, \x14, d0[1]
-
- vmull.s16 q13, \x16, d1[2]
- vmlal.s16 q13, \x12, d1[3]
- vmlal.s16 q13, \x10, d2[0]
- vmlal.s16 q13, \x6, d2[1]
-
- vmull.s16 q15, \x4, d2[2]
- vmlsl.s16 q15, \x8, d0[0]
- vmlsl.s16 q15, \x14, d0[1]
-
- vmull.s16 q12, \x16, d0[2]
- vmlal.s16 q12, \x12, d0[3]
- vmlal.s16 q12, \x10, d1[0]
- vmlal.s16 q12, \x6, d1[1]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y29, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y29, q14, #\shift
- .endif
-
- vadd.s32 q10, q15, q12
- vsub.s32 q15, q15, q12
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q15, q15, #\shift
- vmovn.s32 \y27, q10
- vmovn.s32 \y28, q15
- .else
- vrshrn.s32 \y27, q10, #\shift
- vrshrn.s32 \y28, q15, #\shift
- .endif
-.endm
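-
-/* The \shift > 16 branch above exists because the narrowing shift in
- * VRSHRN is limited to the range 1..16, so larger descales are split into
- * VRSHR followed by VMOVN. Both paths compute the same rounding descale
- * (a C sketch):
- *
- *   static inline int16_t descale(int32_t x, int shift)
- *   {
- *     return (int16_t) ((x + (1 << (shift - 1))) >> shift);
- *   }
- */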
-
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- vpush {d8-d15}
-
- /* Load constants (d3 is just used for padding) */
- adr TMP4, jsimd_idct_4x4_neon_consts
- vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | d8 | d9
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | d14 | d15
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q3, q3, q10
- vmul.s16 q4, q4, q11
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- vmul.s16 q6, q6, q13
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q7, q7, q14
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
- idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
- transpose_4x4 d5, d7, d9, d11
-
- /* Pass 2 */
- idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
- transpose_4x4 d26, d27, d28, d29
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vadd.s16 q14, q14, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q14
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-  /* We can use far fewer instructions on little-endian systems if the
-   * OS kernel is not configured to trap unaligned memory accesses.
-   */
- vst1.32 {d26[0]}, [TMP1]!
- vst1.32 {d27[0]}, [TMP3]!
- vst1.32 {d26[1]}, [TMP2]!
- vst1.32 {d27[1]}, [TMP4]!
-#else
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[0]}, [TMP3]!
- vst1.8 {d26[1]}, [TMP1]!
- vst1.8 {d27[1]}, [TMP3]!
- vst1.8 {d26[2]}, [TMP1]!
- vst1.8 {d27[2]}, [TMP3]!
- vst1.8 {d26[3]}, [TMP1]!
- vst1.8 {d27[3]}, [TMP3]!
-
- vst1.8 {d26[4]}, [TMP2]!
- vst1.8 {d27[4]}, [TMP4]!
- vst1.8 {d26[5]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP4]!
- vst1.8 {d26[6]}, [TMP2]!
- vst1.8 {d27[6]}, [TMP4]!
- vst1.8 {d26[7]}, [TMP2]!
- vst1.8 {d27[7]}, [TMP4]!
-#endif
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* d0[0] */
- .short FIX_0_850430095 /* d0[1] */
- .short -FIX_1_272758580 /* d0[2] */
- .short FIX_3_624509785 /* d0[3] */
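-
-/* One butterfly of the 2x2 reduced IDCT as computed by idct_helper below
- * (a C sketch; inputs 2, 4 and 6 do not contribute, and descale() is the
- * rounding right-shift sketched earlier):
- *
- *   int32_t even = in0 << 15;
- *   int32_t odd  = in1 * FIX_3_624509785 - in3 * FIX_1_272758580 +
- *                  in5 * FIX_0_850430095 - in7 * FIX_0_720959822;
- *   out0 = descale(even + odd, shift);
- *   out1 = descale(even - odd, shift);
- */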
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- vshll.s16 q14, \x4, #15
- vmull.s16 q13, \x6, d0[3]
- vmlal.s16 q13, \x10, d0[2]
- vmlal.s16 q13, \x12, d0[1]
- vmlal.s16 q13, \x16, d0[0]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y27, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y27, q14, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP2, jsimd_idct_2x2_neon_consts
- vld1.16 {d0}, [TMP2, :64]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | - | -
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | - | -
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* Dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vmul.s16 q3, q3, q10
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27}, [DCT_TABLE, :128]!
- vmul.s16 q6, q6, q13
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
-#if 0
- idct_helper d4, d6, d10, d12, d16, 13, d4, d6
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d11, d13, d17, 13, d5, d7
- transpose_4x4 d5, d7, d9, d11
-#else
- vmull.s16 q13, d6, d0[3]
- vmlal.s16 q13, d10, d0[2]
- vmlal.s16 q13, d12, d0[1]
- vmlal.s16 q13, d16, d0[0]
- vmull.s16 q12, d7, d0[3]
- vmlal.s16 q12, d11, d0[2]
- vmlal.s16 q12, d13, d0[1]
- vmlal.s16 q12, d17, d0[0]
- vshll.s16 q14, d4, #15
- vshll.s16 q15, d5, #15
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
- vrshrn.s32 d4, q10, #13
- vrshrn.s32 d6, q14, #13
- vadd.s32 q10, q15, q12
- vsub.s32 q14, q15, q12
- vrshrn.s32 d5, q10, #13
- vrshrn.s32 d7, q14, #13
- vtrn.16 q2, q3
- vtrn.32 q3, q5
-#endif
-
- /* Pass 2 */
- idct_helper d4, d6, d10, d7, d11, 20, d26, d27
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q13
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[4]}, [TMP1]!
- vst1.8 {d26[1]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP2]!
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
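-
-/* Per-pixel arithmetic implemented by the stage1/stage2 macros below
- * (a C sketch using the constants from the jsimd_ycc_*_neon_consts
- * tables; descale() is the rounding right-shift sketched earlier, and
- * each result is saturated to [0, 255]):
- *
- *   int cb = u - 128, cr = v - 128;
- *   r = y + descale(22971 * cr, 14);               // y + 1.40200 * Cr
- *   g = y + descale(-11277 * cb - 23401 * cr, 15); // y - 0.34414 * Cb - 0.71414 * Cr
- *   b = y + descale(29033 * cb, 14);               // y + 1.77200 * Cb
- */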
-
-
-.macro do_load size
- .if \size == 8
- vld1.8 {d4}, [U, :64]!
- vld1.8 {d5}, [V, :64]!
- vld1.8 {d0}, [Y, :64]!
- pld [U, #64]
- pld [V, #64]
- pld [Y, #64]
- .elseif \size == 4
- vld1.8 {d4[0]}, [U]!
- vld1.8 {d4[1]}, [U]!
- vld1.8 {d4[2]}, [U]!
- vld1.8 {d4[3]}, [U]!
- vld1.8 {d5[0]}, [V]!
- vld1.8 {d5[1]}, [V]!
- vld1.8 {d5[2]}, [V]!
- vld1.8 {d5[3]}, [V]!
- vld1.8 {d0[0]}, [Y]!
- vld1.8 {d0[1]}, [Y]!
- vld1.8 {d0[2]}, [Y]!
- vld1.8 {d0[3]}, [Y]!
- .elseif \size == 2
- vld1.8 {d4[4]}, [U]!
- vld1.8 {d4[5]}, [U]!
- vld1.8 {d5[4]}, [V]!
- vld1.8 {d5[5]}, [V]!
- vld1.8 {d0[4]}, [Y]!
- vld1.8 {d0[5]}, [Y]!
- .elseif \size == 1
- vld1.8 {d4[6]}, [U]!
- vld1.8 {d5[6]}, [V]!
- vld1.8 {d0[6]}, [Y]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size
- .if \bpp == 24
- .if \size == 8
- vst3.8 {d10, d11, d12}, [RGB]!
- .elseif \size == 4
- vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vst4.8 {d10, d11, d12, d13}, [RGB]!
- .elseif \size == 4
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 16
- .if \size == 8
- vst1.16 {q15}, [RGB]!
- .elseif \size == 4
- vst1.16 {d30}, [RGB]!
- .elseif \size == 2
- vst1.16 {d31[0]}, [RGB]!
- vst1.16 {d31[1]}, [RGB]!
- .elseif \size == 1
- vst1.16 {d31[2]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
-    vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- vrshrn.s32 d20, q10, #15
- vrshrn.s32 d21, q11, #15
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vrshrn.s32 d28, q14, #14
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q11, q10, d0
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16
- vqmovun.s16 d1\g_offs, q11
- vqmovun.s16 d1\r_offs, q12
- vqmovun.s16 d1\b_offs, q14
- .else /* rgb565 */
- vqshlu.s16 q13, q11, #8
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vsri.u16 q15, q13, #5
- vsri.u16 q15, q14, #11
- .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
- /* "do_yuv_to_rgb_stage2" and "store" */
- vrshrn.s32 d20, q10, #15
- /* "load" and "do_yuv_to_rgb_stage1" */
- pld [U, #64]
- vrshrn.s32 d21, q11, #15
- pld [V, #64]
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vld1.8 {d4}, [U, :64]!
- vrshrn.s32 d28, q14, #14
- vld1.8 {d5}, [V, :64]!
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
-    vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vaddw.u8 q11, q10, d0
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
- vqmovun.s16 d1\g_offs, q11
- pld [Y, #64]
- vqmovun.s16 d1\r_offs, q12
- vld1.8 {d0}, [Y, :64]!
- vqmovun.s16 d1\b_offs, q14
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- do_store \bpp, 8
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
- .else /**************************** rgb565 ********************************/
- vqshlu.s16 q13, q11, #8
- pld [Y, #64]
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vld1.8 {d0}, [Y, :64]!
- vmull.s16 q11, d7, d1[1]
- vmlal.s16 q11, d9, d1[2]
- vsri.u16 q15, q13, #5
- vmull.s16 q12, d8, d1[0]
- vsri.u16 q15, q14, #11
- vmull.s16 q13, d9, d1[0]
- vmull.s16 q14, d6, d1[3]
- do_store \bpp, 8
- vmull.s16 q15, d7, d1[3]
- .endif
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple's gas crashes on adrl, so work around that by using adr instead.
- * This requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- INPUT_ROW .req r2
- OUTPUT_BUF .req r3
- NUM_ROWS .req r4
-
- INPUT_BUF0 .req r5
- INPUT_BUF1 .req r6
- INPUT_BUF2 .req INPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d1, d2, d3 (d0 is just used for padding) */
- adr ip, jsimd_ycc_\colorid\()_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #4]
- ldr INPUT_BUF2, [INPUT_BUF, #8]
- .unreq INPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Initially set d10, d11, d12, d13 to 0xFF */
- vmov.u8 q5, #255
- vmov.u8 q6, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
- ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- blt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load 4
-3:
- tst N, #2
- beq 4f
- do_load 2
-4:
- tst N, #1
- beq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- beq 6f
- do_store \bpp, 4
-6:
- tst N, #2
- beq 7f
- do_store \bpp, 2
-7:
- tst N, #1
- beq 8f
- do_store \bpp, 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
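-
-/* Per-pixel arithmetic implemented by the stage macros below (a C sketch
- * using the constants from the jsimd_*_ycc_neon_consts tables; the chroma
- * bias ((128 << 16) + 32767) folds both the +128 offset and the rounding
- * into the initial accumulator value):
- *
- *   y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
- *   cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
- *   cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b) >> 16;
- */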
-
-.macro do_store size
- .if \size == 8
- vst1.8 {d20}, [Y]!
- vst1.8 {d21}, [U]!
- vst1.8 {d22}, [V]!
- .elseif \size == 4
- vst1.8 {d20[0]}, [Y]!
- vst1.8 {d20[1]}, [Y]!
- vst1.8 {d20[2]}, [Y]!
- vst1.8 {d20[3]}, [Y]!
- vst1.8 {d21[0]}, [U]!
- vst1.8 {d21[1]}, [U]!
- vst1.8 {d21[2]}, [U]!
- vst1.8 {d21[3]}, [U]!
- vst1.8 {d22[0]}, [V]!
- vst1.8 {d22[1]}, [V]!
- vst1.8 {d22[2]}, [V]!
- vst1.8 {d22[3]}, [V]!
- .elseif \size == 2
- vst1.8 {d20[4]}, [Y]!
- vst1.8 {d20[5]}, [Y]!
- vst1.8 {d21[4]}, [U]!
- vst1.8 {d21[5]}, [U]!
- vst1.8 {d22[4]}, [V]!
- vst1.8 {d22[5]}, [V]!
- .elseif \size == 1
- vst1.8 {d20[6]}, [Y]!
- vst1.8 {d21[6]}, [U]!
- vst1.8 {d22[6]}, [V]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size
- .if \bpp == 24
- .if \size == 8
- vld3.8 {d10, d11, d12}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vld4.8 {d10, d11, d12, d13}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vrev64.32 q9, q1
- vrev64.32 q13, q1
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vshrn.u32 d23, q13, #16
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- vmovn.u16 d20, q10 /* d20 = y */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovn.u16 d22, q12 /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vrev64.32 q9, q1
- vshrn.u32 d23, q13, #16
- vrev64.32 q13, q1
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- do_load \bpp, 8
- vmovn.u16 d20, q10 /* d20 = y */
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovn.u16 d22, q12 /* d22 = v */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vst1.8 {d20}, [Y]!
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vst1.8 {d21}, [U]!
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vst1.8 {d22}, [V]!
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- OUTPUT_BUF .req r2
- OUTPUT_ROW .req r3
- NUM_ROWS .req r4
-
- OUTPUT_BUF0 .req r5
- OUTPUT_BUF1 .req r6
- OUTPUT_BUF2 .req OUTPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d0, d1, d2, d3 */
- adr ip, jsimd_\colorid\()_ycc_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load \bpp, 8
- do_rgb_to_yuv_stage1
- subs N, N, #8
- blt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load \bpp, 4
-3:
- tst N, #2
- beq 4f
- do_load \bpp, 2
-4:
- tst N, #1
- beq 5f
- do_load \bpp, 1
-5:
- do_rgb_to_yuv
- tst N, #4
- beq 6f
- do_store 4
-6:
- tst N, #2
- beq 7f
- do_store 2
-7:
- tst N, #1
- beq 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: this can be combined with 'jsimd_fdct_ifast_neon' to get rid of
- * the VST1.16 instructions.
- */
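-
-/* Scalar equivalent (a C sketch): widen each 8-bit sample to 16 bits and
- * recenter it around zero (128 == CENTERJSAMPLE).
- *
- *   for (row = 0; row < 8; row++)
- *     for (col = 0; col < 8; col++)
- *       workspace[row * 8 + col] =
- *         (int16_t) sample_data[row][start_col + col] - 128;
- */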
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, but less accurate, integer implementation
- * of the forward DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_fdct_ifast' function from jfdctfst.c.
- *
- * TODO: this can be combined with 'jsimd_convsamp_neon' to get rid of
- * a bunch of VLD1.16 instructions.
- */
-
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req r0
- TMP .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP, jsimd_fdct_ifast_neon_consts
- vld1.16 {d0}, [TMP, :64]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | q8
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vld1.16 {d28, d29, d30, d31}, [DATA, :128]
- sub DATA, DATA, #(128 - 32)
-
- mov TMP, #2
-1:
- /* Transpose */
- vtrn.16 q12, q13
- vtrn.16 q10, q11
- vtrn.16 q8, q9
- vtrn.16 q14, q15
- vtrn.32 q9, q11
- vtrn.32 q13, q15
- vtrn.32 q8, q10
- vtrn.32 q12, q14
- vswp d30, d23
- vswp d24, d17
- vswp d26, d19
- /* 1-D FDCT */
- vadd.s16 q2, q11, q12
- vswp d28, d21
- vsub.s16 q12, q11, q12
- vsub.s16 q6, q10, q13
- vadd.s16 q10, q10, q13
- vsub.s16 q7, q9, q14
- vadd.s16 q9, q9, q14
- vsub.s16 q1, q8, q15
- vadd.s16 q8, q8, q15
- vsub.s16 q4, q9, q10
- vsub.s16 q5, q8, q2
- vadd.s16 q3, q9, q10
- vadd.s16 q4, q4, q5
- vadd.s16 q2, q8, q2
- vqdmulh.s16 q4, q4, XFIX_0_707106781
- vadd.s16 q11, q12, q6
- vadd.s16 q8, q2, q3
- vsub.s16 q12, q2, q3
- vadd.s16 q3, q6, q7
- vadd.s16 q7, q7, q1
- vqdmulh.s16 q3, q3, XFIX_0_707106781
- vsub.s16 q6, q11, q7
- vadd.s16 q10, q5, q4
- vqdmulh.s16 q6, q6, XFIX_0_382683433
- vsub.s16 q14, q5, q4
- vqdmulh.s16 q11, q11, XFIX_0_541196100
- vqdmulh.s16 q5, q7, XFIX_1_306562965
- vadd.s16 q4, q1, q3
- vsub.s16 q3, q1, q3
- vadd.s16 q7, q7, q6
- vadd.s16 q11, q11, q6
- vadd.s16 q7, q7, q5
- vadd.s16 q13, q3, q11
- vsub.s16 q11, q3, q11
- vadd.s16 q9, q4, q7
- vsub.s16 q15, q4, q7
- subs TMP, TMP, #1
- bne 1b
-
- /* store results */
- vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vst1.16 {d28, d29, d30, d31}, [DATA, :128]
-
- vpop {d8-d15}
- bx lr
-
- .unreq DATA
- .unreq TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- * Note: the code uses 2-stage pipelining in order to improve instruction
- *       scheduling and eliminate stalls (this provides ~15% better
- *       performance for this function on both ARM Cortex-A8 and
- *       ARM Cortex-A9 when compared to the non-pipelined variant).
- *       The instructions which belong to the second stage use different
- *       indentation for better readability.
- */
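-
-/* Scalar equivalent for one coefficient (a C sketch; the divisors table
- * holds precomputed reciprocals, corrections and shift counts at byte
- * offsets 0, 64 * 2 and 64 * 6 respectively):
- *
- *   sign = workspace[i] >> 15;                        // 0 or -1
- *   temp = (uint16_t) (abs(workspace[i]) + correction[i]);
- *   temp = (uint32_t) (temp * reciprocal[i]) >> 16;
- *   temp >>= shift[i];
- *   coef_block[i] = (int16_t) ((temp ^ sign) - sign); // restore sign
- */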
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req r0
- DIVISORS .req r1
- WORKSPACE .req r2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req r3
- SHIFT .req ip
- LOOP_COUNT .req r4
-
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- vabs.s16 q12, q0
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
-
- push {r4, r5}
- mov LOOP_COUNT, #3
-1:
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- veor.u16 q14, q14, q2 /* restore sign */
- vabs.s16 q12, q0
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- veor.u16 q15, q15, q3
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vsub.u16 q14, q14, q2
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vsub.u16 q15, q15, q3
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
- subs LOOP_COUNT, LOOP_COUNT, #1
- bne 1b
- pop {r4, r5}
-
- veor.u16 q14, q14, q2 /* restore sign */
- veor.u16 q15, q15, q3
- vsub.u16 q14, q14, q2
- vsub.u16 q15, q15, q3
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
- bx lr /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
- * JDIMENSION downsampled_width,
- * JSAMPARRAY input_data,
- * JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- *       this code; addressing it could potentially yield a performance
- *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded into q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 from q0 and q1.
- * Register d28 is used for multiplication by 3, and register q15 for
- * adding the +1 bias.
- */
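-
-/* Per source pixel this computes the usual "fancy" (triangle filter)
- * h2v1 upsampling, matching the scalar code in jdsample.c (a C sketch;
- * prev/next are the horizontal neighbours of cur):
- *
- *   out[2 * i]     = (3 * cur + prev + 1) >> 2;
- *   out[2 * i + 1] = (3 * cur + next + 2) >> 2;
- */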
-.macro upsample16 OUTPTR, INPTR
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vmov q1, q0 /* backup source pixels to q1 */
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to the
- * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
- * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
- * instructions are not needed. This unrolling also allows loads and stores
- * to be reordered to compensate for multiplication latency and reduce stalls.
- */
-.macro upsample32 OUTPTR, INPTR
- /* even 16 pixels group */
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- /* odd 16 pixels group */
- vld1.8 {q1}, [\INPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vmovl.u8 q8, d2
- vext.8 q2, q0, q1, #15
- vmovl.u8 q9, d3
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d2, d28
- vmlal.u8 q11, d3, d28
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
- /* special case for the first and last pixels */
- sub \WIDTH, \WIDTH, #1
- add \OUTPTR, \OUTPTR, #1
- ldrb \TMP1, [\INPTR, \WIDTH]
- strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
- ldrb \TMP1, [\INPTR], #1
- strb \TMP1, [\OUTPTR, #-1]
- vmov.8 d3[7], \TMP1
-
- subs \WIDTH, \WIDTH, #32
- blt 5f
-0: /* process 32 pixels per iteration */
- upsample32 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #32
- bge 0b
-5:
- adds \WIDTH, \WIDTH, #16
- blt 1f
-0: /* process 16 pixels if needed */
- upsample16 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #16
-1:
- adds \WIDTH, \WIDTH, #16
- beq 9f
-
- /* load the remaining 1-15 pixels */
- add \INPTR, \INPTR, \WIDTH
- tst \WIDTH, #1
- beq 2f
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #2
- beq 2f
- vext.8 d0, d0, d0, #6
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #4
- beq 2f
- vrev64.32 d0, d0
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[3]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[2]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #8
- beq 2f
- vmov d1, d0
- sub \INPTR, \INPTR, #8
- vld1.8 {d0}, [\INPTR]
-2: /* upsample the remaining pixels */
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vrshrn.u16 d10, q8, #2
- vrshrn.u16 d12, q9, #2
- vshrn.u16 d11, q10, #2
- vshrn.u16 d13, q11, #2
- vzip.8 d10, d11
- vzip.8 d12, d13
- /* store the remaining pixels */
- tst \WIDTH, #8
- beq 2f
- vst1.8 {d10, d11}, [\OUTPTR]!
- vmov q5, q6
-2:
- tst \WIDTH, #4
- beq 2f
- vst1.8 {d10}, [\OUTPTR]!
- vmov d10, d11
-2:
- tst \WIDTH, #2
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
- vst1.8 {d10[2]}, [\OUTPTR]!
- vst1.8 {d10[3]}, [\OUTPTR]!
- vext.8 d10, d10, d10, #4
-2:
- tst \WIDTH, #1
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-asm_function jsimd_h2v1_fancy_upsample_neon
-
- MAX_V_SAMP_FACTOR .req r0
- DOWNSAMPLED_WIDTH .req r1
- INPUT_DATA .req r2
- OUTPUT_DATA_PTR .req r3
- OUTPUT_DATA .req OUTPUT_DATA_PTR
-
- OUTPTR .req r4
- INPTR .req r5
- WIDTH .req ip
- TMP .req lr
-
- push {r4, r5, r6, lr}
- vpush {d8-d15}
-
- ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
- cmp MAX_V_SAMP_FACTOR, #0
- ble 99f
-
- /* initialize constants */
- vmov.u8 d28, #3
- vmov.u16 q15, #1
-11:
- ldr INPTR, [INPUT_DATA], #4
- ldr OUTPTR, [OUTPUT_DATA], #4
- mov WIDTH, DOWNSAMPLED_WIDTH
- upsample_row OUTPTR, INPTR, WIDTH, TMP
- subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
- bgt 11b
-
-99:
- vpop {d8-d15}
- pop {r4, r5, r6, pc}
-
- .unreq MAX_V_SAMP_FACTOR
- .unreq DOWNSAMPLED_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA_PTR
- .unreq OUTPUT_DATA
-
- .unreq OUTPTR
- .unreq INPTR
- .unreq WIDTH
- .unreq TMP
-
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- sub \PUT_BITS, \PUT_BITS, #0x8
- lsr \TMP, \PUT_BUFFER, \PUT_BITS
- uxtb \TMP, \TMP
- strb \TMP, [\BUFFER, #1]!
- cmp \TMP, #0xff
- /*it eq*/
- strbeq \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
- /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
- add \PUT_BITS, \SIZE
- /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
- orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- cmp \PUT_BITS, #0x10
- blt 15f
- eor \ZERO, \ZERO, \ZERO
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
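
Taken together, these three macros are the standard JPEG entropy-coder bit sink:
put_bits appends SIZE bits of CODE to a left-justified accumulator, checkbuf15
flushes two bytes once 16 or more bits are pending, and emit_byte performs the
mandatory 0xFF 0x00 byte stuffing. A hedged C model of the combined logic (names
are illustrative; the asm pre-decrements BUFFER once up front, to which the
post-increment below is equivalent):

    /* Append `size` bits of `code`, flushing complete bytes and stuffing
     * a 0x00 after every emitted 0xFF, as the JPEG bitstream requires. */
    static unsigned char *
    put_bits_model (unsigned char *buf, unsigned int *accum, int *nbits,
                    unsigned int code, int size)
    {
      *accum = (*accum << size) | code;      /* put_bits */
      *nbits += size;
      if (*nbits >= 16) {                    /* checkbuf15 */
        int i;
        for (i = 0; i < 2; i++) {            /* emit_byte, twice */
          unsigned char b;
          *nbits -= 8;
          b = (unsigned char) (*accum >> *nbits);
          *buf++ = b;
          if (b == 0xFF)
            *buf++ = 0;                      /* byte stuffing */
        }
      }
      return buf;
    }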
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
- .byte 0x01
- .byte 0x02
- .byte 0x04
- .byte 0x08
- .byte 0x10
- .byte 0x20
- .byte 0x40
- .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
- push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- add r7, sp, #0x1c
- sub r4, sp, #0x40
- bfc r4, #0, #5
- mov sp, r4 /* align sp on 32 bytes */
- vst1.64 {d8, d9, d10, d11}, [r4, :128]!
- vst1.64 {d12, d13, d14, d15}, [r4, :128]
- sub sp, #0x140 /* reserve 320 bytes */
- str r0, [sp, #0x18] /* working state > sp + 0x18 */
- add r4, sp, #0x20 /* r4 = t1 */
- ldr lr, [r7, #0x8] /* lr = dctbl */
- sub r10, r1, #0x1 /* r10=buffer-- */
- ldrsh r1, [r2]
- mov r9, #0x10
- mov r8, #0x1
- adr r5, jsimd_huff_encode_one_block_neon_consts
- /* prepare data */
- vld1.8 {d26}, [r5, :64]
- veor q8, q8, q8
- veor q9, q9, q9
- vdup.16 q14, r9
- vdup.16 q15, r8
- veor q10, q10, q10
- veor q11, q11, q11
- sub r1, r1, r3
- add r9, r2, #0x22
- add r8, r2, #0x18
- add r3, r2, #0x36
- vmov.16 d0[0], r1
- vld1.16 {d2[0]}, [r9, :16]
- vld1.16 {d4[0]}, [r8, :16]
- vld1.16 {d6[0]}, [r3, :16]
- add r1, r2, #0x2
- add r9, r2, #0x30
- add r8, r2, #0x26
- add r3, r2, #0x28
- vld1.16 {d0[1]}, [r1, :16]
- vld1.16 {d2[1]}, [r9, :16]
- vld1.16 {d4[1]}, [r8, :16]
- vld1.16 {d6[1]}, [r3, :16]
- add r1, r2, #0x10
- add r9, r2, #0x40
- add r8, r2, #0x34
- add r3, r2, #0x1a
- vld1.16 {d0[2]}, [r1, :16]
- vld1.16 {d2[2]}, [r9, :16]
- vld1.16 {d4[2]}, [r8, :16]
- vld1.16 {d6[2]}, [r3, :16]
- add r1, r2, #0x20
- add r9, r2, #0x32
- add r8, r2, #0x42
- add r3, r2, #0xc
- vld1.16 {d0[3]}, [r1, :16]
- vld1.16 {d2[3]}, [r9, :16]
- vld1.16 {d4[3]}, [r8, :16]
- vld1.16 {d6[3]}, [r3, :16]
- add r1, r2, #0x12
- add r9, r2, #0x24
- add r8, r2, #0x50
- add r3, r2, #0xe
- vld1.16 {d1[0]}, [r1, :16]
- vld1.16 {d3[0]}, [r9, :16]
- vld1.16 {d5[0]}, [r8, :16]
- vld1.16 {d7[0]}, [r3, :16]
- add r1, r2, #0x4
- add r9, r2, #0x16
- add r8, r2, #0x60
- add r3, r2, #0x1c
- vld1.16 {d1[1]}, [r1, :16]
- vld1.16 {d3[1]}, [r9, :16]
- vld1.16 {d5[1]}, [r8, :16]
- vld1.16 {d7[1]}, [r3, :16]
- add r1, r2, #0x6
- add r9, r2, #0x8
- add r8, r2, #0x52
- add r3, r2, #0x2a
- vld1.16 {d1[2]}, [r1, :16]
- vld1.16 {d3[2]}, [r9, :16]
- vld1.16 {d5[2]}, [r8, :16]
- vld1.16 {d7[2]}, [r3, :16]
- add r1, r2, #0x14
- add r9, r2, #0xa
- add r8, r2, #0x44
- add r3, r2, #0x38
- vld1.16 {d1[3]}, [r1, :16]
- vld1.16 {d3[3]}, [r9, :16]
- vld1.16 {d5[3]}, [r8, :16]
- vld1.16 {d7[3]}, [r3, :16]
- vcgt.s16 q8, q8, q0
- vcgt.s16 q9, q9, q1
- vcgt.s16 q10, q10, q2
- vcgt.s16 q11, q11, q3
- vabs.s16 q0, q0
- vabs.s16 q1, q1
- vabs.s16 q2, q2
- vabs.s16 q3, q3
- veor q8, q8, q0
- veor q9, q9, q1
- veor q10, q10, q2
- veor q11, q11, q3
- add r9, r4, #0x20
- add r8, r4, #0x80
- add r3, r4, #0xa0
- vclz.i16 q0, q0
- vclz.i16 q1, q1
- vclz.i16 q2, q2
- vclz.i16 q3, q3
- vsub.i16 q0, q14, q0
- vsub.i16 q1, q14, q1
- vsub.i16 q2, q14, q2
- vsub.i16 q3, q14, q3
- vst1.16 {d0, d1, d2, d3}, [r4, :256]
- vst1.16 {d4, d5, d6, d7}, [r9, :256]
- vshl.s16 q0, q15, q0
- vshl.s16 q1, q15, q1
- vshl.s16 q2, q15, q2
- vshl.s16 q3, q15, q3
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q15
- vsub.i16 q2, q2, q15
- vsub.i16 q3, q3, q15
- vand q8, q8, q0
- vand q9, q9, q1
- vand q10, q10, q2
- vand q11, q11, q3
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- add r1, r2, #0x46
- add r9, r2, #0x3a
- add r8, r2, #0x74
- add r3, r2, #0x6a
- vld1.16 {d8[0]}, [r1, :16]
- vld1.16 {d10[0]}, [r9, :16]
- vld1.16 {d12[0]}, [r8, :16]
- vld1.16 {d14[0]}, [r3, :16]
- veor q8, q8, q8
- veor q9, q9, q9
- veor q10, q10, q10
- veor q11, q11, q11
- add r1, r2, #0x54
- add r9, r2, #0x2c
- add r8, r2, #0x76
- add r3, r2, #0x78
- vld1.16 {d8[1]}, [r1, :16]
- vld1.16 {d10[1]}, [r9, :16]
- vld1.16 {d12[1]}, [r8, :16]
- vld1.16 {d14[1]}, [r3, :16]
- add r1, r2, #0x62
- add r9, r2, #0x1e
- add r8, r2, #0x68
- add r3, r2, #0x7a
- vld1.16 {d8[2]}, [r1, :16]
- vld1.16 {d10[2]}, [r9, :16]
- vld1.16 {d12[2]}, [r8, :16]
- vld1.16 {d14[2]}, [r3, :16]
- add r1, r2, #0x70
- add r9, r2, #0x2e
- add r8, r2, #0x5a
- add r3, r2, #0x6c
- vld1.16 {d8[3]}, [r1, :16]
- vld1.16 {d10[3]}, [r9, :16]
- vld1.16 {d12[3]}, [r8, :16]
- vld1.16 {d14[3]}, [r3, :16]
- add r1, r2, #0x72
- add r9, r2, #0x3c
- add r8, r2, #0x4c
- add r3, r2, #0x5e
- vld1.16 {d9[0]}, [r1, :16]
- vld1.16 {d11[0]}, [r9, :16]
- vld1.16 {d13[0]}, [r8, :16]
- vld1.16 {d15[0]}, [r3, :16]
- add r1, r2, #0x64
- add r9, r2, #0x4a
- add r8, r2, #0x3e
- add r3, r2, #0x6e
- vld1.16 {d9[1]}, [r1, :16]
- vld1.16 {d11[1]}, [r9, :16]
- vld1.16 {d13[1]}, [r8, :16]
- vld1.16 {d15[1]}, [r3, :16]
- add r1, r2, #0x56
- add r9, r2, #0x58
- add r8, r2, #0x4e
- add r3, r2, #0x7c
- vld1.16 {d9[2]}, [r1, :16]
- vld1.16 {d11[2]}, [r9, :16]
- vld1.16 {d13[2]}, [r8, :16]
- vld1.16 {d15[2]}, [r3, :16]
- add r1, r2, #0x48
- add r9, r2, #0x66
- add r8, r2, #0x5c
- add r3, r2, #0x7e
- vld1.16 {d9[3]}, [r1, :16]
- vld1.16 {d11[3]}, [r9, :16]
- vld1.16 {d13[3]}, [r8, :16]
- vld1.16 {d15[3]}, [r3, :16]
- vcgt.s16 q8, q8, q4
- vcgt.s16 q9, q9, q5
- vcgt.s16 q10, q10, q6
- vcgt.s16 q11, q11, q7
- vabs.s16 q4, q4
- vabs.s16 q5, q5
- vabs.s16 q6, q6
- vabs.s16 q7, q7
- veor q8, q8, q4
- veor q9, q9, q5
- veor q10, q10, q6
- veor q11, q11, q7
- add r1, r4, #0x40
- add r9, r4, #0x60
- add r8, r4, #0xc0
- add r3, r4, #0xe0
- vclz.i16 q4, q4
- vclz.i16 q5, q5
- vclz.i16 q6, q6
- vclz.i16 q7, q7
- vsub.i16 q4, q14, q4
- vsub.i16 q5, q14, q5
- vsub.i16 q6, q14, q6
- vsub.i16 q7, q14, q7
- vst1.16 {d8, d9, d10, d11}, [r1, :256]
- vst1.16 {d12, d13, d14, d15}, [r9, :256]
- vshl.s16 q4, q15, q4
- vshl.s16 q5, q15, q5
- vshl.s16 q6, q15, q6
- vshl.s16 q7, q15, q7
- vsub.i16 q4, q4, q15
- vsub.i16 q5, q5, q15
- vsub.i16 q6, q6, q15
- vsub.i16 q7, q7, q15
- vand q8, q8, q4
- vand q9, q9, q5
- vand q10, q10, q6
- vand q11, q11, q7
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- ldr r12, [r7, #0xc] /* r12 = actbl */
- add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
- mov r9, r12 /* r9 = actbl */
- add r6, r4, #0x80 /* r6 = t2 */
- ldr r11, [r0, #0x8] /* r11 = put_buffer */
- ldr r4, [r0, #0xc] /* r4 = put_bits */
- ldrh r2, [r6, #-128] /* r2 = nbits */
- ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */
- ldr r0, [lr, r2, lsl #2]
- ldrb r5, [r1, r2]
- put_bits r11, r4, r0, r5
- checkbuf15 r10, r11, r4, r5, r0
- put_bits r11, r4, r3, r2
- checkbuf15 r10, r11, r4, r5, r0
- mov lr, r6 /* lr = t2 */
- add r5, r9, #0x400 /* r5 = actbl->ehufsi */
- ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
- veor q8, q8, q8
- vceq.i16 q0, q0, q8
- vceq.i16 q1, q1, q8
- vceq.i16 q2, q2, q8
- vceq.i16 q3, q3, q8
- vceq.i16 q4, q4, q8
- vceq.i16 q5, q5, q8
- vceq.i16 q6, q6, q8
- vceq.i16 q7, q7, q8
- vmovn.i16 d0, q0
- vmovn.i16 d2, q1
- vmovn.i16 d4, q2
- vmovn.i16 d6, q3
- vmovn.i16 d8, q4
- vmovn.i16 d10, q5
- vmovn.i16 d12, q6
- vmovn.i16 d14, q7
- vand d0, d0, d26
- vand d2, d2, d26
- vand d4, d4, d26
- vand d6, d6, d26
- vand d8, d8, d26
- vand d10, d10, d26
- vand d12, d12, d26
- vand d14, d14, d26
- vpadd.i8 d0, d0, d2
- vpadd.i8 d4, d4, d6
- vpadd.i8 d8, d8, d10
- vpadd.i8 d12, d12, d14
- vpadd.i8 d0, d0, d4
- vpadd.i8 d8, d8, d12
- vpadd.i8 d0, d0, d8
- vmov.32 r1, d0[1]
- vmov.32 r8, d0[0]
- mvn r1, r1
- mvn r8, r8
- lsrs r1, r1, #0x1
- rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
- rbit r1, r1 /* r1 = index1 */
- rbit r8, r8 /* r8 = index0 */
- ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
- str r1, [sp, #0x14] /* index1 > sp + 0x14 */
- cmp r8, #0x0
- beq 6f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r12, sp, #0x20 /* r12 = t1 */
- ldr r8, [sp, #0x14] /* r8 = index1 */
- adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
- cmp r8, #0x0
- beq 6f
- clz r2, r8
- sub r12, r12, lr
- lsl r8, r8, r2
- add r2, r2, r12, lsr #1
- add lr, lr, r2, lsl #1
- b 7f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
-7:
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r0, sp, #0x20
- add r0, #0xfe
- cmp lr, r0
- bhs 1f
- ldr r1, [r9]
- ldrb r0, [r5]
- put_bits r11, r4, r1, r0
- checkbuf15 r10, r11, r4, r0, r1
-1:
- ldr r12, [sp, #0x18]
- str r11, [r12, #0x8]
- str r4, [r12, #0xc]
- add r0, r10, #0x1
- add r4, sp, #0x140
- vld1.64 {d8, d9, d10, d11}, [r4, :128]!
- vld1.64 {d12, d13, d14, d15}, [r4, :128]
- sub r4, r7, #0x1c
- mov sp, r4
- pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
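
For readers following the rbit/clz loops above: the vceq/vand/vpadd chain condenses
the 64 coefficients into a 64-bit nonzero bitmap split across r8 and r1, mvn flips
it so a set bit means "nonzero" (lsrs/rrx then shift out the DC bit), and rbit
reverses each word so that clz returns the zero-run length before the next nonzero
coefficient. A compact C sketch of that scanning idea, assuming GCC/Clang's
__builtin_clz (illustrative, one 32-coefficient word):

    /* Walk `bits` (bit 31 = first coefficient, set bit = nonzero). */
    static void
    scan_nonzero_runs (unsigned int bits)
    {
      while (bits != 0) {
        int run = __builtin_clz(bits);  /* zeros before next nonzero */
        bits <<= run;                   /* drop the zero run */
        /* ... emit the (run, size) Huffman symbol here, splitting
         * runs of 16+ into ZRL (0xF0) symbols as the asm does ... */
        bits <<= 1;                     /* consume the nonzero coeff */
      }
    }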
diff --git a/media/libjpeg/simd/jsimd_i386.c b/media/libjpeg/simd/jsimd_i386.c
deleted file mode 100644
index 6da8bd8913..0000000000
--- a/media/libjpeg/simd/jsimd_i386.c
+++ /dev/null
@@ -1,1091 +0,0 @@
-/*
- * jsimd_i386.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
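
The check is the usual power-of-two mask test: a pointer is (1 << order)-byte
aligned exactly when its low `order` bits are clear. Two illustrative uses with
hypothetical addresses:

    #include <assert.h>

    static void
    alignment_examples (void)
    {
      /* 1 << 4 == 16, so order 4 tests SSE's 16-byte alignment. */
      assert(IS_ALIGNED((void *)0x1000, 4));   /* 0x1000 & 0xF == 0 */
      assert(!IS_ALIGNED((void *)0x1004, 4));  /* 0x1004 & 0xF == 4 */
    }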
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = jpeg_simd_cpu_support();
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEMMX");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_MMX;
- env = getenv("JSIMD_FORCE3DNOW");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_3DNOW|JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_SSE|JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_SSE2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
-
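
One conventional way to address the FIXME above, where POSIX threads are
available, is to funnel detection through pthread_once so the CPU probe and the
environment overrides run exactly once. A sketch only, assuming <pthread.h>;
this is not what the library does:

    #include <pthread.h>

    static pthread_once_t simd_once = PTHREAD_ONCE_INIT;

    static void
    init_simd_body (void)
    {
      /* the body of init_simd() above, minus the early-return guard */
    }

    LOCAL(void)
    init_simd (void)
    {
      pthread_once(&simd_once, init_simd_body);
    }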
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
- void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_ycc_convert_sse2;
- mmxfct=jsimd_extrgb_ycc_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_ycc_convert_sse2;
- mmxfct=jsimd_extrgbx_ycc_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_ycc_convert_sse2;
- mmxfct=jsimd_extbgr_ycc_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_ycc_convert_sse2;
- mmxfct=jsimd_extbgrx_ycc_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_ycc_convert_sse2;
- mmxfct=jsimd_extxbgr_ycc_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_ycc_convert_sse2;
- mmxfct=jsimd_extxrgb_ycc_convert_mmx;
- break;
- default:
- sse2fct=jsimd_rgb_ycc_convert_sse2;
- mmxfct=jsimd_rgb_ycc_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
- void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_gray_convert_sse2;
- mmxfct=jsimd_extrgb_gray_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_gray_convert_sse2;
- mmxfct=jsimd_extrgbx_gray_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_gray_convert_sse2;
- mmxfct=jsimd_extbgr_gray_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_gray_convert_sse2;
- mmxfct=jsimd_extbgrx_gray_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_gray_convert_sse2;
- mmxfct=jsimd_extxbgr_gray_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_gray_convert_sse2;
- mmxfct=jsimd_extxrgb_gray_convert_mmx;
- break;
- default:
- sse2fct=jsimd_rgb_gray_convert_sse2;
- mmxfct=jsimd_rgb_gray_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_ycc_extrgb_convert_sse2;
- mmxfct=jsimd_ycc_extrgb_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_ycc_extrgbx_convert_sse2;
- mmxfct=jsimd_ycc_extrgbx_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_ycc_extbgr_convert_sse2;
- mmxfct=jsimd_ycc_extbgr_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_ycc_extbgrx_convert_sse2;
- mmxfct=jsimd_ycc_extbgrx_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_ycc_extxbgr_convert_sse2;
- mmxfct=jsimd_ycc_extxbgr_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_ycc_extxrgb_convert_sse2;
- mmxfct=jsimd_ycc_extxrgb_convert_mmx;
- break;
- default:
- sse2fct=jsimd_ycc_rgb_convert_sse2;
- mmxfct=jsimd_ycc_rgb_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
- break;
- default:
- sse2fct=jsimd_h2v2_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_merged_upsample_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
- break;
- default:
- sse2fct=jsimd_h2v1_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_merged_upsample_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_SSE)
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_convsamp_sse2(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_MMX)
- jsimd_convsamp_mmx(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_SSE)
- jsimd_convsamp_float_sse(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- jsimd_fdct_islow_sse2(data);
- else if (simd_support & JSIMD_MMX)
- jsimd_fdct_islow_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- jsimd_fdct_ifast_sse2(data);
- else if (simd_support & JSIMD_MMX)
- jsimd_fdct_ifast_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- jsimd_fdct_float_sse(data);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_fdct_float_3dnow(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_SSE)
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_quantize_sse2(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_MMX)
- jsimd_quantize_mmx(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_quantize_float_sse2(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_SSE)
- jsimd_quantize_float_sse(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
- if (sizeof(FLOAT_MULT_TYPE) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- return 1;
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
- jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && simd_huffman &&
- IS_ALIGNED_SSE(jconst_huff_encode_one_block))
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_mips.c b/media/libjpeg/simd/jsimd_mips.c
deleted file mode 100644
index 63b8115d16..0000000000
--- a/media/libjpeg/simd/jsimd_mips.c
+++ /dev/null
@@ -1,1138 +0,0 @@
-/*
- * jsimd_mips.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * MIPS architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__)
-
-LOCAL(int)
-parse_proc_cpuinfo(const char* search_string)
-{
- const char* file_name = "/proc/cpuinfo";
- char cpuinfo_line[256];
- FILE* f = NULL;
- simd_support = 0;
-
- if ((f = fopen(file_name, "r")) != NULL) {
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
- if (strstr(cpuinfo_line, search_string) != NULL) {
- fclose(f);
- simd_support |= JSIMD_MIPS_DSPR2;
- return 1;
- }
- }
- fclose(f);
- }
- /* Did not find string in the proc file, or not Linux ELF. */
- return 0;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
- simd_support |= JSIMD_MIPS_DSPR2;
-#elif defined(__linux__)
- /* We may still be able to use MIPS DSPR2, even without -mdspr2 passed
- * to gcc globally, by performing runtime detection via /proc/cpuinfo
- * parsing on Linux. */
- if (!parse_proc_cpuinfo("MIPS 74K"))
- return;
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEDSPR2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_MIPS_DSPR2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-}
-
-static const int mips_idct_ifast_coefs[4] = {
- 0x45404540, // FIX( 1.082392200 / 2) = 17734 = 0x4546
- 0x5A805A80, // FIX( 1.414213562 / 2) = 23170 = 0x5A82
- 0x76407640, // FIX( 1.847759065 / 2) = 30274 = 0x7642
- 0xAC60AC60 // FIX(-2.613125930 / 4) = -21407 = 0xAC61
-};
-
-/* The following struct is borrowed from jdsample.c */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-typedef struct {
- struct jpeg_upsampler pub;
- JSAMPARRAY color_buf[MAX_COMPONENTS];
- upsample1_ptr methods[MAX_COMPONENTS];
- int next_row_out;
- JDIMENSION rows_to_go;
- int rowgroup_height[MAX_COMPONENTS];
- UINT8 h_expand[MAX_COMPONENTS];
- UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf,
- output_buf, output_row, num_rows,
- cinfo->num_components);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (DCTSIZE != 8)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
- cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
- compptr->v_samp_factor,
- cinfo->max_v_samp_factor,
- cinfo->smoothing_factor,
- compptr->width_in_blocks,
- cinfo->image_width);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
- cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- cinfo->output_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- cinfo->output_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-
- jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index],
- upsample->v_expand[compptr->component_index],
- input_data, output_data_ptr,
- cinfo->output_width,
- cinfo->max_v_samp_factor);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
- JSAMPLE *);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
- break;
- }
-
- mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
- JSAMPLE *);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
- break;
- }
-
- mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_fdct_islow_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_fdct_ifast_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12 (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int workspace[DCTSIZE*4]; /* buffers data between passes */
- jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col, workspace);
- }
-}
-
-GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int workspace[96];
- int output[12] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col),
- (int)(output_buf[8] + output_col),
- (int)(output_buf[9] + output_col),
- (int)(output_buf[10] + output_col),
- (int)(output_buf[11] + output_col),
- };
- jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table,
- workspace);
- jsimd_idct_12x12_pass2_mips_dspr2(workspace, output);
- }
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int output[8] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col),
- };
-
- jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table,
- output, IDCT_range_limit(cinfo));
- }
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- JCOEFPTR inptr;
- IFAST_MULT_TYPE *quantptr;
- DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
-
- /* Pass 1: process columns from input, store into work array. */
-
- inptr = coef_block;
- quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-
- jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
- workspace, mips_idct_ifast_coefs);
-
- /* Pass 2: process rows from work array, store into output array. */
- /* Note that we must descale the results by a factor of 8 == 2**3, */
- /* and also undo the PASS1_BITS scaling. */
-
- jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
- output_col, mips_idct_ifast_coefs);
- }
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
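
The dispatch shims deleted above all follow the same libjpeg-turbo convention: a jsimd_can_*() predicate re-checks the compile-time layout assumptions the hand-written kernel depends on (DCTSIZE, BITS_IN_JSAMPLE, the sizes of JCOEF/DCTELEM/JDIMENSION) plus the runtime CPU-feature mask filled in by init_simd(), and the matching jsimd_*() wrapper forwards to the kernel only when the feature bit is set. A minimal sketch of that pattern in plain C; every name here (HAVE_FAKE_SIMD, fake_fdct_kernel, and so on) is a hypothetical stand-in, not part of the library:

    #define SUPPORT_NONE 0u
    #define SUPPORT_FAKE 1u   /* hypothetical stand-in for JSIMD_MIPS_DSPR2 */

    static unsigned int simd_support = ~0u;

    static void init_simd(void)
    {
      if (simd_support != ~0u)
        return;                       /* probe CPU features only once */
    #ifdef HAVE_FAKE_SIMD             /* hypothetical build flag */
      simd_support = SUPPORT_FAKE;
    #else
      simd_support = SUPPORT_NONE;
    #endif
    }

    static void fake_fdct_kernel(short *data) { (void)data; }  /* placeholder */

    /* "Can" helper: reject any configuration the kernel was not built for. */
    int jsimd_can_fake_fdct(void)
    {
      init_simd();
      if (sizeof(short) != 2)         /* mirrors the sizeof(DCTELEM) checks */
        return 0;
      return (simd_support & SUPPORT_FAKE) ? 1 : 0;
    }

    /* Wrapper: callers are expected to consult jsimd_can_fake_fdct() first. */
    void jsimd_fake_fdct(short *data)
    {
      if (simd_support & SUPPORT_FAKE)
        fake_fdct_kernel(data);
    }
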
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2.S b/media/libjpeg/simd/jsimd_mips_dspr2.S
deleted file mode 100644
index c8c286cb3e..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2.S
+++ /dev/null
@@ -1,4487 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
- * Darko Laus (darko.laus@imgtec.com)
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#include "jsimd_mips_dspr2_asm.h"
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- * 20(sp) - cinfo->num_components
- *
- * Null conversion for compression
- */
-
- SAVE_REGS_ON_STACK 8, s0, s1
-
- lw t9, 24(sp) // t9 = num_rows
- lw s0, 28(sp) // s0 = cinfo->num_components
- andi t0, a0, 3 // t0 = cinfo->image_width & 3
- beqz t0, 4f // no residual
- nop
-0:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-1:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-2:
- lbu t3, 0(t2)
- addiu t5, t5, 1
- sb t3, -1(t5)
- bne t6, t5, 2b
- addu t2, t2, s0
-3:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 3b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 1b
- nop
- addiu a1, a1, 4
- bgez t9, 0b
- addiu a3, a3, 1
- b 7f
- nop
-4:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-5:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-6:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 6b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 5b
- nop
- addiu a1, a1, 4
- bgez t9, 4b
- addiu a3, a3, 1
-7:
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-
-END(jsimd_c_null_convert_mips_dspr2)
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_ycc_convert_mips_dspr2
- * jsimd_extbgr_ycc_convert_mips_dspr2
- * jsimd_extrgbx_ycc_convert_mips_dspr2
- * jsimd_extbgrx_ycc_convert_mips_dspr2
- * jsimd_extxbgr_ycc_convert_mips_dspr2
- * jsimd_extxrgb_ycc_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_YCC r, \
- g, \
- b, \
- inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw t7, 48(sp) // t7 = num_rows
- li s0, 0x4c8b // FIX(0.29900)
- li s1, 0x9646 // FIX(0.58700)
- li s2, 0x1d2f // FIX(0.11400)
- li s3, 0xffffd4cd // -FIX(0.16874)
- li s4, 0xffffab33 // -FIX(0.33126)
- li s5, 0x8000 // FIX(0.50000)
- li s6, 0xffff94d1 // -FIX(0.41869)
- li s7, 0xffffeb2f // -FIX(0.08131)
- li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
-
-0:
- addiu t7, -1 // --num_rows
- lw t6, 0(a1) // t6 = input_buf[0]
- lw t0, 0(a2)
- lw t1, 4(a2)
- lw t2, 8(a2)
- sll t3, a3, 2
- lwx t0, t3(t0) // t0 = output_buf[0][output_row]
- lwx t1, t3(t1) // t1 = output_buf[1][output_row]
- lwx t2, t3(t2) // t2 = output_buf[2][output_row]
-
- addu t9, t2, a0 // t9 = end address
- addiu a3, 1
-
-1:
- DO_RGB_TO_YCC t3, t4, t5, t6
-
- mtlo s5, $ac0
- mtlo t8, $ac1
- mtlo t8, $ac2
- maddu $ac0, s2, t5
- maddu $ac1, s5, t5
- maddu $ac2, s5, t3
- maddu $ac0, s0, t3
- maddu $ac1, s3, t3
- maddu $ac2, s6, t4
- maddu $ac0, s1, t4
- maddu $ac1, s4, t4
- maddu $ac2, s7, t5
- extr.w t3, $ac0, 16
- extr.w t4, $ac1, 16
- extr.w t5, $ac2, 16
- sb t3, 0(t0)
- sb t4, 0(t1)
- sb t5, 0(t2)
- addiu t0, 1
- addiu t2, 1
- bne t2, t9, 1b
- addiu t1, 1
- bgtz t7, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_YCC
-
-.endm
-
-/*------------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
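
The li constants at the top of the RGB->YCbCr macro body are the BT.601 weights in 16.16 fixed point (0x4c8b = FIX(0.29900), 0x9646 = FIX(0.58700), 0x1d2f = FIX(0.11400), and the sign-extended values are the negated weights), and 0x807fff is CBCR_OFFSET + ONE_HALF - 1, which recenters the chroma channels on 128 and rounds in a single addition. A scalar sketch of the per-pixel arithmetic, assuming 8-bit samples (this mirrors libjpeg's jccolor.c, but is a sketch, not the shipped code):

    #define SCALEBITS  16
    #define ONE_HALF   (1L << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char out[3])
    {
      long y  =  FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b;
      long cb = -FIX(0.16874) * r - FIX(0.33126) * g + FIX(0.50000) * b;
      long cr =  FIX(0.50000) * r - FIX(0.41869) * g - FIX(0.08131) * b;

      out[0] = (unsigned char)((y + ONE_HALF) >> SCALEBITS);
      /* (128 << SCALEBITS) + ONE_HALF - 1 is the 0x807fff constant above:
       * it recenters chroma on 128 and rounds in one addition */
      out[1] = (unsigned char)((cb + (128L << SCALEBITS) + ONE_HALF - 1) >> SCALEBITS);
      out[2] = (unsigned char)((cr + (128L << SCALEBITS) + ONE_HALF - 1) >> SCALEBITS);
    }
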
-
-/*****************************************************************************/
-/*
- * jsimd_ycc_extrgb_convert_mips_dspr2
- * jsimd_ycc_extbgr_convert_mips_dspr2
- * jsimd_ycc_extrgbx_convert_mips_dspr2
- * jsimd_ycc_extbgrx_convert_mips_dspr2
- * jsimd_ycc_extxbgr_convert_mips_dspr2
- * jsimd_ycc_extxrgb_convert_mips_dspr2
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
-
-.macro STORE_YCC_TO_RGB scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r_offs(\outptr)
- sb \scratch1, \g_offs(\outptr)
- sb \scratch2, \b_offs(\outptr)
-.if (\pixel_size == 4)
- li t0, 0xFF
- sb t0, \a_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - input_row
- * a3 - output_buf
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s1, 48(sp)
- li t3, 0x8000
- li t4, 0x166e9 // FIX(1.40200)
- li t5, 0x1c5a2 // FIX(1.77200)
- li t6, 0xffff492e // -FIX(0.71414)
- li t7, 0xffffa7e6 // -FIX(0.34414)
- repl.ph t8, 128
-
-0:
- lw s0, 0(a3)
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- sll s5, a2, 2
- addiu s1, -1
- lwx s2, s5(t0)
- lwx s3, s5(t1)
- lwx s4, s5(t2)
- addu t9, s2, a0
- addiu a2, 1
-
-1:
- lbu s7, 0(s4) // cr
- lbu s6, 0(s3) // cb
- lbu s5, 0(s2) // y
- addiu s2, 1
- addiu s4, 1
- addiu s7, -128
- addiu s6, -128
- mul t2, t7, s6
- mul t0, t6, s7 // Crgtab[cr]
- sll s7, 15
- mulq_rs.w t1, t4, s7 // Crrtab[cr]
- sll s6, 15
- addu t2, t3 // Cbgtab[cb]
- addu t2, t0
-
- mulq_rs.w t0, t5, s6 // Cbbtab[cb]
- sra t2, 16
- addu t1, s5
- addu t2, s5 // add y
- ins t2, t1, 16, 16
- subu.ph t2, t2, t8
- addu t0, s5
- shll_s.ph t2, t2, 8
- subu t0, 128
- shra.ph t2, t2, 8
- shll_s.w t0, t0, 24
- addu.ph t2, t2, t8 // clip & store
- sra t0, t0, 24
- sra t1, t2, 16
- addiu t0, 128
-
- STORE_YCC_TO_RGB t1, t2, t0, s0
-
- bne s2, t9, 1b
- addiu s3, 1
- bgtz s1, 0b
- addiu a3, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-
-.purgem STORE_YCC_TO_RGB
-
-.endm
-
-/*------------------------------------------id -- pix R G B A */
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
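
The inverse transform uses the matching constants: 0x166e9 = FIX(1.40200), 0x1c5a2 = FIX(1.77200), and the sign-extended 0xffffa7e6/0xffff492e are -FIX(0.34414) and -FIX(0.71414). The assembly clamps results through the cinfo->sample_range_limit table; the clamp255() helper below is a plain stand-in for that lookup. A scalar sketch of one pixel:

    #define SCALEBITS  16
    #define ONE_HALF   (1L << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    /* stand-in for the sample_range_limit table lookup */
    static unsigned char clamp255(long v)
    {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char rgb[3])
    {
      cb -= 128;  cr -= 128;
      rgb[0] = clamp255(y + ((FIX(1.40200) * cr + ONE_HALF) >> SCALEBITS));
      rgb[1] = clamp255(y + ((-FIX(0.34414) * cb - FIX(0.71414) * cr + ONE_HALF)
                             >> SCALEBITS));
      rgb[2] = clamp255(y + ((FIX(1.77200) * cb + ONE_HALF) >> SCALEBITS));
    }
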
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_gray_convert_mips_dspr2
- * jsimd_extbgr_gray_convert_mips_dspr2
- * jsimd_extrgbx_gray_convert_mips_dspr2
- * jsimd_extbgrx_gray_convert_mips_dspr2
- * jsimd_extxbgr_gray_convert_mips_dspr2
- * jsimd_extxrgb_gray_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> GRAY
- */
-
-.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_GRAY r, \
- g, \
- b, \
- inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- li s0, 0x4c8b // s0 = FIX(0.29900)
- li s1, 0x9646 // s1 = FIX(0.58700)
- li s2, 0x1d2f // s2 = FIX(0.11400)
- li s7, 0x8000 // s7 = FIX(0.50000)
- lw s6, 48(sp)
- andi t7, a0, 3
-
-0:
- addiu s6, -1 // s6 = num_rows
- lw t0, 0(a1)
- lw t1, 0(a2)
- sll t3, a3, 2
- lwx t1, t3(t1)
- addiu a3, 1
- addu t9, t1, a0
- subu t8, t9, t7
- beq t1, t8, 2f
- nop
-
-1:
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t6, $ac0, 16
-
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- extr.w t2, $ac1, 16
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t5, $ac0, 16
- sb t6, 0(t1)
- sb t2, 1(t1)
- extr.w t3, $ac1, 16
- addiu t1, 4
- sb t5, -2(t1)
- sb t3, -1(t1)
- bne t1, t8, 1b
- nop
-
-2:
- beqz t7, 4f
- nop
-
-3:
- DO_RGB_TO_GRAY t3, t4, t5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- extr.w t6, $ac0, 16
- sb t6, 0(t1)
- addiu t1, 1
- bne t1, t9, 3b
- nop
-
-4:
- bgtz s6, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_gray_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_GRAY
-
-.endm
-
-/*------------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
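
The gray-convert routines reuse only the luma weights from the RGB->YCbCr conversion above; what is distinctive is the loop shape: a 4x-unrolled main loop feeding two accumulators, then a scalar tail for the width % 4 leftover pixels (label 3:). A sketch of that split, with luma() as a hypothetical helper built from the same 0x4c8b/0x9646/0x1d2f weights:

    static unsigned char luma(const unsigned char *p)   /* p = one RGB pixel */
    {
      /* 19595/38470/7471 are FIX(0.299)/FIX(0.587)/FIX(0.114), i.e. the
       * 0x4c8b/0x9646/0x1d2f constants loaded above */
      return (unsigned char)((19595L * p[0] + 38470L * p[1] + 7471L * p[2] +
                              32768L) >> 16);
    }

    static void rgb_to_gray_row(const unsigned char *rgb, unsigned char *gray,
                                int width)
    {
      int i, body = width & ~3;          /* 4x-unrolled main loop, as above */
      for (i = 0; i < body; i += 4) {
        gray[i]     = luma(rgb + 3 * i);
        gray[i + 1] = luma(rgb + 3 * (i + 1));
        gray[i + 2] = luma(rgb + 3 * (i + 2));
        gray[i + 3] = luma(rgb + 3 * (i + 3));
      }
      for (; i < width; i++)             /* residual tail, cf. label 3: */
        gray[i] = luma(rgb + 3 * i);
    }
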
-/*****************************************************************************/
-/*
- * jsimd_h2v2_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v2 upsample routines
- */
-.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
- pixel_size, \
- r1_offs, \
- g1_offs, \
- b1_offs, \
- a1_offs, \
- r2_offs, \
- g2_offs, \
- b2_offs, \
- a2_offs
-
-.macro STORE_H2V2_2_PIXELS scratch0 \
- scratch1 \
- scratch2 \
- scratch3 \
- scratch4 \
- scratch5 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li \scratch0, 0xFF
- sb \scratch0, \a1_offs(\outptr)
- sb \scratch0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V2_1_PIXEL scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0 - cinfo->output_width
- * a1 - input_buf
- * a2 - in_row_group_ctr
- * a3 - output_buf
- * 16(sp) - cinfo->sample_range_limit
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- lw t9, 56(sp) // cinfo->sample_range_limit
- lw v0, 0(a1)
- lw v1, 4(a1)
- lw t0, 8(a1)
- sll t1, a2, 3
- addiu t2, t1, 4
- sll t3, a2, 2
- lw t4, 0(a3) // t4 = output_buf[0]
- lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
- lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
- lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
- lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
- lw t7, 4(a3) // t7 = output_buf[1]
- li s1, 0xe6ea
- addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
- addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
- addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
- xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
- srl t3, a0, 1
- blez t3, 2f
- addu t0, t5, t3 // t0 = end address
-1:
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t5, t5, 1
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- extr_r.w s5, $ac1, 16
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- lbu v0, 0(t1)
- addiu t6, t6, 1
- addiu t1, t1, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, -1(t1)
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, 1(t2)
- addiu t2, t2, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
-
- bne t0, t5, 1b
- nop
-2:
- andi t0, a0, 1
- beqz t0, 4f
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- lbu v0, 0(t1)
- extr_r.w s5, $ac1, 16
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V2_1_PIXEL
-.purgem STORE_H2V2_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
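
Merged upsampling fuses h2v2 (or, further down, h2v1) chroma upsampling with the YCbCr->RGB conversion: the chroma terms are computed once per (Cb, Cr) sample and then combined with two luma samples per row, for two output rows in the h2v2 case and one in the h2v1 case. A scalar sketch of one 2x2 block, with clamp255() standing in for the range-limit table and the decimal constants equal to the hex FIX() values noted above:

    static unsigned char clamp255(long v)
    {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 91881/22554/46802/116130 are FIX(1.40200), FIX(0.34414), FIX(0.71414)
     * and FIX(1.77200) -- the 0x166e9/0xffffa7e6/0xffff492e/0x1c5a2
     * constants in the assembly above. */
    static void merged_h2v2_block(const unsigned char *y0,  /* 2 luma rows */
                                  const unsigned char *y1,
                                  int cb, int cr,
                                  unsigned char *out0, unsigned char *out1)
    {
      long cred, cgreen, cblue;
      int i;

      cb -= 128;  cr -= 128;
      cred   = (91881L * cr + 32768L) >> 16;             /* once per block */
      cgreen = (-22554L * cb - 46802L * cr + 32768L) >> 16;
      cblue  = (116130L * cb + 32768L) >> 16;

      for (i = 0; i < 2; i++) {                          /* 2 pixels/row */
        out0[3 * i]     = clamp255(y0[i] + cred);
        out0[3 * i + 1] = clamp255(y0[i] + cgreen);
        out0[3 * i + 2] = clamp255(y0[i] + cblue);
        out1[3 * i]     = clamp255(y1[i] + cred);
        out1[3 * i + 1] = clamp255(y1[i] + cgreen);
        out1[3 * i + 2] = clamp255(y1[i] + cblue);
      }
    }
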
-/*****************************************************************************/
-/*
- * jsimd_h2v1_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v1 upsample routines
- */
-
-.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
- pixel_size, \
- r1_offs, \
- g1_offs, \
- b1_offs, \
- a1_offs, \
- r2_offs, \
- g2_offs, \
- b2_offs, \
- a2_offs
-
-.macro STORE_H2V1_2_PIXELS scratch0 \
- scratch1 \
- scratch2 \
- scratch3 \
- scratch4 \
- scratch5 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
- sb t0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V1_1_PIXEL scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0 - cinfo->output_width
- * a1 - input_buf
- * a2 - in_row_group_ctr
- * a3 - output_buf
- * 16(sp) - range_limit
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- li t0, 0xe6ea
- lw t1, 0(a1) // t1 = input_buf[0]
- lw t2, 4(a1) // t2 = input_buf[1]
- lw t3, 8(a1) // t3 = input_buf[2]
- lw t8, 56(sp) // t8 = range_limit
- addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
- addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
- addiu s0, t0, 0x9916 // s0 = 0x8000
- addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
- xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
- srl t0, a0, 1
- sll t4, a2, 2
- lwx s5, t4(t1) // s5 = inptr0
- lwx s6, t4(t2) // s6 = inptr1
- lwx s7, t4(t3) // s7 = inptr2
- lw t7, 0(a3) // t7 = outptr
- blez t0, 2f
- addu t9, s6, t0 // t9 = end address
-1:
- lbu t2, 0(s6) // t2 = cb
- lbu t0, 0(s7) // t0 = cr
- lbu t1, 0(s5) // t1 = y
- addiu t2, t2, -128 // t2 = cb - 128
- addiu t0, t0, -128 // t0 = cr - 128
- mult $ac1, s4, t2
- madd $ac1, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
- extr_r.w t5, $ac1, 16
- mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
- addiu s7, s7, 1
- addiu s6, s6, 1
- addu t2, t1, t0 // t2 = y + cred
- addu t3, t1, t5 // t3 = y + cgreen
- addu t4, t1, t6 // t4 = y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t1, 1(s5)
- lbu v0, 0(t2)
- lbu v1, 0(t3)
- lbu ra, 0(t4)
- addu t2, t1, t0
- addu t3, t1, t5
- addu t4, t1, t6
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
-
- bne t9, s6, 1b
- addiu s5, s5, 2
-2:
- andi t0, a0, 1
- beqz t0, 4f
- nop
-3:
- lbu t2, 0(s6)
- lbu t0, 0(s7)
- lbu t1, 0(s5)
- addiu t2, t2, -128 //(cb - 128)
- addiu t0, t0, -128 //(cr - 128)
- mul t3, s4, t2
- mul t4, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
- addu t3, t3, s0
- addu t3, t4, t3
- sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
- addu t2, t1, t0 // y + cred
- addu t3, t1, t5 // y + cgreen
- addu t4, t1, t6 // y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_1_PIXEL t2, t3, t4, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V1_1_PIXEL
-.purgem STORE_H2V1_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v2_fancy_upsample_mips_dspr2
- *
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- */
-LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - downsampled_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- li s4, 0
- lw s2, 0(a3) // s2 = *output_data_ptr
-0:
- li t9, 2
- lw s1, -4(a2) // s1 = inptr1
-
-1:
- lw s0, 0(a2) // s0 = inptr0
- lwx s3, s4(s2)
- addiu s5, a1, -2 // s5 = downsampled_width - 2
- srl t4, s5, 1
- sll t4, t4, 1
- lbu t0, 0(s0)
- lbu t1, 1(s0)
- lbu t2, 0(s1)
- lbu t3, 1(s1)
- addiu s0, 2
- addiu s1, 2
- addu t8, s0, t4 // t8 = end address
- andi s5, s5, 1 // s5 = residual
- sll t4, t0, 1
- sll t6, t1, 1
- addu t0, t0, t4 // t0 = (*inptr0++) * 3
- addu t1, t1, t6 // t1 = (*inptr0++) * 3
- addu t7, t0, t2 // t7 = thiscolsum
- addu t6, t1, t3 // t6 = nextcolsum
- sll t0, t7, 2 // t0 = thiscolsum * 4
- subu t1, t0, t7 // t1 = thiscolsum * 3
- shra_r.w t0, t0, 4
- addiu t1, 7
- addu t1, t1, t6
- srl t1, t1, 4
- sb t0, 0(s3)
- sb t1, 1(s3)
- beq t8, s0, 22f // skip to final iteration if width == 3
- addiu s3, 2
-2:
- lh t0, 0(s0) // t0 = A3|A2
- lh t2, 0(s1) // t2 = B3|B2
- addiu s0, 2
- addiu s1, 2
- preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
- preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
- shll.ph t1, t0, 1
- sll t3, t6, 1
- addu.ph t0, t1, t0 // t0 = A3*3|A2*3
- addu t3, t3, t6 // t3 = this * 3
- addu.ph t0, t0, t2 // t0 = next2|next1
- addu t1, t3, t7
- andi t7, t0, 0xFFFF // t7 = next1
- sll t2, t7, 1
- addu t2, t7, t2 // t2 = next1*3
- addu t4, t2, t6
- srl t6, t0, 16 // t6 = next2
- shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
- addu t0, t3, t7
- addiu t0, 7
- srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
- shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
- addu t2, t2, t6
- addiu t2, 7
- srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- sb t4, 2(s3)
- sb t2, 3(s3)
- bne t8, s0, 2b
- addiu s3, 4
-22:
- beqz s5, 4f
- addu t8, s0, s5
-3:
- lbu t0, 0(s0)
- lbu t2, 0(s1)
- addiu s0, 1
- addiu s1, 1
- sll t3, t6, 1
- sll t1, t0, 1
- addu t1, t0, t1 // t1 = inptr0 * 3
- addu t3, t3, t6 // t3 = thiscolsum * 3
- addu t5, t1, t2
- addu t1, t3, t7
- shra_r.w t1, t1, 4
- addu t0, t3, t5
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu s3, 2
- move t7, t6
- bne t8, s0, 3b
- move t6, t5
-4:
- sll t0, t6, 2 // t0 = thiscolsum * 4
- subu t1, t0, t6 // t1 = thiscolsum * 3
- addu t1, t1, t7
- addiu s4, 4
- shra_r.w t1, t1, 4
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu t9, -1
- addiu s3, 2
- bnez t9, 1b
- lw s1, 4(a2)
- srl t0, s4, 2
- subu t0, a0, t0
- bgtz t0, 0b
- addiu a2, 4
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-END(jsimd_h2v2_fancy_upsample_mips_dspr2)
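
The "fancy" h2v2 upsampler is a triangle filter: each column is first blended 3:1 with the vertically nearer input row (thiscolsum), then each output pair is blended 3:1 horizontally. The asymmetric rounding constants visible above ((x + 8) >> 4 next to (x + 7) >> 4) keep adjacent outputs from always rounding the same way. The h2v1 routine that follows is the one-dimensional case with weights 3/4 and 1/4 and rounding terms +1/+2 over >> 2. A scalar sketch of one h2v2 output row, assuming downsampled_width >= 2 and edge-replicated input:

    static void h2v2_fancy_row(const unsigned char *near_row,
                               const unsigned char *far_row,
                               unsigned char *out, int downsampled_width)
    {
      int col;
      long prev, curr;

      /* colsum = 3 * nearer input row + farther input row, per column */
      curr = 3L * near_row[0] + far_row[0];
      out[0] = (unsigned char)((curr * 4 + 8) >> 4);      /* left edge */
      for (col = 1; col < downsampled_width; col++) {
        prev = curr;
        curr = 3L * near_row[col] + far_row[col];
        out[2 * col - 1] = (unsigned char)((3 * prev + curr + 7) >> 4);
        out[2 * col]     = (unsigned char)((3 * curr + prev + 8) >> 4);
      }
      out[2 * downsampled_width - 1] = (unsigned char)((curr * 4 + 7) >> 4);
    }
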
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - downsampled_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- .set at
-
- beqz a0, 3f
- sll t0, a0, 2
- lw s1, 0(a3)
- li s3, 0x10001
- addu s0, s1, t0
-0:
- addiu t8, a1, -2
- srl t9, t8, 2
- lw t7, 0(a2)
- lw s2, 0(s1)
- lbu t0, 0(t7)
- lbu t1, 1(t7) // t1 = inptr[1]
- sll t2, t0, 1
- addu t2, t2, t0 // t2 = invalue*3
- addu t2, t2, t1
- shra_r.w t2, t2, 2
- sb t0, 0(s2)
- sb t2, 1(s2)
- beqz t9, 11f
- addiu s2, 2
-1:
- ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
- ulw t1, 1(t7)
- ulh t2, 4(t7) // t2 = |0|0|P5|P4|
- preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
- preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
- preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
- preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
- preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
- shll.ph t5, t4, 1
- shll.ph t6, t1, 1
- addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
- addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
- addu.ph t4, t3, s3
- addu.ph t0, t0, s3
- addu.ph t4, t4, t5
- addu.ph t0, t0, t6
- shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
- shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
- addu.ph t2, t2, t5
- addu.ph t3, t3, t6
- shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
- shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
- shll.ph t2, t2, 8
- shll.ph t3, t3, 8
- or t2, t4, t2
- or t3, t3, t0
- addiu t9, -1
- usw t3, 0(s2)
- usw t2, 4(s2)
- addiu s2, 8
- bgtz t9, 1b
- addiu t7, 4
-11:
- andi t8, 3
- beqz t8, 22f
- addiu t7, 1
-
-2:
- lbu t0, 0(t7)
- addiu t7, 1
- sll t1, t0, 1
- addu t2, t0, t1 // t2 = invalue * 3
- lbu t3, -2(t7)
- lbu t4, 0(t7)
- addiu t3, 1
- addiu t4, 2
- addu t3, t3, t2
- addu t4, t4, t2
- srl t3, 2
- srl t4, 2
- sb t3, 0(s2)
- sb t4, 1(s2)
- addiu t8, -1
- bgtz t8, 2b
- addiu s2, 2
-
-22:
- lbu t0, 0(t7)
- lbu t2, -1(t7)
- sll t1, t0, 1
- addu t1, t1, t0 // t1 = invalue * 3
- addu t1, t1, t2
- addiu t1, 1
- srl t1, t1, 2
- sb t1, 0(s2)
- sb t0, 1(s2)
- addiu s1, 4
- bne s1, s0, 0b
- addiu a2, 4
-3:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_h2v1_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - cinfo->max_v_samp_factor
- * a2 - compptr->v_samp_factor
- * a3 - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
-
- beqz a2, 7f
- lw s1, 44(sp) // s1 = output_data
- lw s0, 40(sp) // s0 = input_data
- srl s2, a0, 2
- andi t9, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks*DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- li s3, 0 // s3 = bias
- srl t7, a0, 1 // t7 = image_width / 2
- srl s4, t7, 2
- andi t8, t7, 3
-1:
- ulhu t0, 0(t5)
- ulhu t1, 2(t5)
- ulhu t2, 4(t5)
- ulhu t3, 6(t5)
- raddu.w.qb t0, t0
- raddu.w.qb t1, t1
- raddu.w.qb t2, t2
- raddu.w.qb t3, t3
- shra.ph t0, t0, 1
- shra_r.ph t1, t1, 1
- shra.ph t2, t2, 1
- shra_r.ph t3, t3, 1
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t3, 3(t4)
- addiu s4, -1
- addiu t4, 4
- bgtz s4, 1b
- addiu t5, 8
- beqz t8, 3f
- addu s4, t4, t8
-2:
- ulhu t0, 0(t5)
- raddu.w.qb t0, t0
- addqh.w t0, t0, s3
- xori s3, s3, 1
- sb t0, 0(t4)
- addiu t4, 1
- bne t4, s4, 2b
- addiu t5, 2
-3:
- lbux t1, t6(t5)
- sll t1, 1
- addqh.w t2, t1, s3 // t2 = pixval1
- xori s3, s3, 1
- addqh.w t3, t1, s3 // t3 = pixval2
- blez s2, 5f
- append t3, t2, 8
- addu t5, t4, s2 // t5 = loop_end2
-4:
- ush t3, 0(t4)
- addiu s2, -1
- bgtz s2, 4b
- addiu t4, 2
-5:
- beqz t9, 6f
- nop
- sb t2, 0(t4)
-6:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 4
-7:
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
-
- j ra
- nop
-END(jsimd_h2v1_downsample_mips_dspr2)
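
h2v1 downsampling averages horizontal pixel pairs, adding a bias that alternates 0, 1, 0, 1 between columns so rounding never drifts in one direction (the xori above toggles it). The h2v2 routine that follows does the same over 2x2 blocks, with a 1/2 alternating bias and a >> 2. A scalar sketch of one h2v1 row, assuming the input row has already been edge-expanded to an even width:

    static void h2v1_downsample_row(const unsigned char *in, unsigned char *out,
                                    int output_cols)
    {
      int i, bias = 0;                  /* alternates 0, 1, 0, 1, ... */
      for (i = 0; i < output_cols; i++) {
        out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }
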
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
-
-/*
- * a0 - cinfo->image_width
- * a1 - cinfo->max_v_samp_factor
- * a2 - compptr->v_samp_factor
- * a3 - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
- .set at
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- beqz a2, 8f
- lw s1, 52(sp) // s1 = output_data
- lw s0, 48(sp) // s0 = input_data
-
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- srl t7, a0, 1 // t7 = image_width / 2
- srl s4, t7, 2
- andi t8, t7, 3
- andi t9, a0, 2
- srl s2, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks*DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- lw s7, 4(s0) // s7 = inptr1
- li s6, 1 // s6 = bias
-2:
- ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
- ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
- ulw t2, 4(t5)
- ulw t3, 4(s7)
- precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
- ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
- raddu.w.qb t1, t7
- raddu.w.qb t0, t0
- shra_r.w t1, t1, 2
- addiu t0, 1
- srl t0, 2
- precrq.ph.w t7, t2, t3
- ins t2, t3, 16, 16
- raddu.w.qb t7, t7
- raddu.w.qb t2, t2
- shra_r.w t7, t7, 2
- addiu t2, 1
- srl t2, 2
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t7, 3(t4)
- addiu t4, 4
- addiu t5, 8
- addiu s4, s4, -1
- bgtz s4, 2b
- addiu s7, 8
- beqz t8, 4f
- addu t8, t4, t8
-3:
- ulhu t0, 0(t5)
- ulhu t1, 0(s7)
- ins t0, t1, 16, 16
- raddu.w.qb t0, t0
- addu t0, t0, s6
- srl t0, 2
- xori s6, s6, 3
- sb t0, 0(t4)
- addiu t5, 2
- addiu t4, 1
- bne t8, t4, 3b
- addiu s7, 2
-4:
- lbux t1, t6(t5)
- sll t1, 1
- lbux t0, t6(s7)
- sll t0, 1
- addu t1, t1, t0
- addu t3, t1, s6
- srl t0, t3, 2 // t0 = pixval1
- xori s6, s6, 3
- addu t2, t1, s6
- srl t1, t2, 2 // t1 = pixval2
- blez s2, 6f
- append t1, t0, 8
-5:
- ush t1, 0(t4)
- addiu s2, -1
- bgtz s2, 5b
- addiu t4, 2
-6:
- beqz t9, 7f
- nop
- sb t0, 0(t4)
-7:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 8
-8:
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_h2v2_downsample_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
-/*
- * a0 - input_data
- * a1 - output_data
- * a2 - compptr->v_samp_factor
- * a3 - cinfo->max_v_samp_factor
- * 16(sp) - cinfo->smoothing_factor
- * 20(sp) - compptr->width_in_blocks
- * 24(sp) - cinfo->image_width
- */
-
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s7, 52(sp) // compptr->width_in_blocks
- lw s0, 56(sp) // cinfo->image_width
- lw s6, 48(sp) // cinfo->smoothing_factor
- sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
- sll v0, s7, 1
- subu v0, v0, s0
- blez v0, 2f
- move v1, zero
- addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
-0:
- addiu t1, a0, -4
- sll t2, v1, 2
- lwx t1, t2(t1)
- move t3, v0
- addu t1, t1, s0
- lbu t2, -1(t1)
-1:
- addiu t3, t3, -1
- sb t2, 0(t1)
- bgtz t3, 1b
- addiu t1, t1, 1
- addiu v1, v1, 1
- bne v1, t0, 0b
- nop
-2:
- li v0, 80
- mul v0, s6, v0
- li v1, 16384
- move t4, zero
- move t5, zero
- subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80
- sll t7, s6, 4 // t7 = smoothing_factor * 16
-3:
-/* Special case for first column: pretend column -1 is same as column 0 */
- sll v0, t4, 2
- lwx t8, v0(a1) // outptr = output_data[outrow]
- sll v1, t5, 2
- addiu t9, v1, 4
- addiu s0, v1, -4
- addiu s1, v1, 8
- lwx s2, v1(a0) // inptr0 = input_data[inrow]
- lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
- lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
- lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, 0(s2)
- lbu v1, 2(s2)
- lbu t0, 0(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, 0(s0)
- lbu t0, 0(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w v0, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- addiu s1, s1, 2
- sb v0, -1(t8)
- addiu s4, s7, -2
- and s4, s4, 3
- addu s5, s4, t8 // end address
-4:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- sb t2, -1(t8)
- bne s5, t8, 4b
- addiu s1, s1, 2
- addiu s5, s7, -2
- subu s5, s5, s4
- addu s5, s5, t8 // end address
-5:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- lh v1, 2(t9)
- addu t0, t0, v0
- lh v0, 2(s2)
- addu s3, t0, s3
- lh t0, 2(s0)
- lh t1, 2(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 4(s2)
- lbu t0, 1(t9)
- lbu t1, 4(t9)
- sb t2, 0(t8)
- raddu.w.qb t3, v0
- lbu v0, 1(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 4(s0)
- addu t0, t0, v0
- lbu v0, 1(s0)
- addu s3, t0, s3
- lbu t0, 1(s1)
- lbu t3, 4(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 4(t9)
- addu t0, t0, v0
- lh v0, 4(s2)
- addu s3, t0, s3
- lh t0, 4(s0)
- lh t1, 4(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 6(s2)
- lbu t0, 3(t9)
- lbu t1, 6(t9)
- sb t2, 1(t8)
- raddu.w.qb t3, v0
- lbu v0, 3(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 6(s0)
- addu t0, t0, v0
- lbu v0, 3(s0)
- addu s3, t0, s3
- lbu t0, 3(s1)
- lbu t3, 6(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 6(t9)
- addu t0, t0, v0
- lh v0, 6(s2)
- addu s3, t0, s3
- lh t0, 6(s0)
- lh t1, 6(s1)
- madd $ac1, s3, t7
- extr_r.w t3, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 8(s2)
- lbu t0, 5(t9)
- lbu t1, 8(t9)
- sb t3, 2(t8)
- raddu.w.qb t2, v0
- lbu v0, 5(s2)
- addu t0, t0, t1
- mult $ac1, t2, t6
- addu v0, v0, v1
- lbu t2, 8(s0)
- addu t0, t0, v0
- lbu v0, 5(s0)
- addu s3, t0, s3
- lbu t0, 5(s1)
- lbu t3, 8(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- addiu t8, t8, 4
- addu t0, t0, v0
- addiu s2, s2, 8
- addu s3, t0, s3
- addiu t9, t9, 8
- madd $ac1, s3, t7
- extr_r.w t1, $ac1, 16
- addiu s0, s0, 8
- addiu s1, s1, 8
- bne s5, t8, 5b
- sb t1, -1(t8)
-/* Special case for last column */
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 1(s2)
- lbu t0, -1(t9)
- lbu t1, 1(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 1(s0)
- addu t0, t0, v0
- lbu t3, 1(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t0, $ac1, 16
- addiu t5, t5, 2
- sb t0, 0(t8)
- addiu t4, t4, 1
- bne t4, a2, 3b
- addiu t5, t5, 2
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_h2v2_smooth_downsample_mips_dspr2)
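
Smooth downsampling blends each 2x2 block with its ring of neighbors: edge neighbors count twice and corner neighbors once (the sll s3, s3, 1 above doubles the edge sum), with the weights 16384 - smoothing_factor * 80 and smoothing_factor * 16 held in t6/t7. A scalar sketch of one output sample; the row pointers are assumed to permit [-1] and [2] accesses, i.e. edge-expanded rows as libjpeg provides:

    static unsigned char smooth_2x2(const unsigned char *above,
                                    const unsigned char *row0,
                                    const unsigned char *row1,
                                    const unsigned char *below,
                                    int smoothing_factor)
    {
      long memberscale = 16384L - smoothing_factor * 80L;  /* t6 above */
      long neighscale  = smoothing_factor * 16L;           /* t7 above */
      long membersum, neighsum;

      /* the four pixels mapped to this output sample (columns 0 and 1) */
      membersum = row0[0] + row0[1] + row1[0] + row1[1];
      /* edge neighbors of the block */
      neighsum = above[0] + above[1] + below[0] + below[1] +
                 row0[-1] + row0[2] + row1[-1] + row1[2];
      neighsum += neighsum;                                /* edges x2 */
      neighsum += above[-1] + above[2] + below[-1] + below[2]; /* corners */

      return (unsigned char)((membersum * memberscale +
                              neighsum * neighscale + 32768L) >> 16);
    }
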
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
-/*
- * a0 - upsample->h_expand[compptr->component_index]
- * a1 - upsample->v_expand[compptr->component_index]
- * a2 - input_data
- * a3 - output_data_ptr
- * 16(sp) - cinfo->output_width
- * 20(sp) - cinfo->max_v_samp_factor
- */
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- lw s0, 0(a3) // s0 = output_data
- lw s1, 32(sp) // s1 = cinfo->output_width
- lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
- li t6, 0 // t6 = inrow
- beqz s2, 10f
- li s3, 0 // s3 = outrow
-0:
- addu t0, a2, t6
- addu t7, s0, s3
- lw t3, 0(t0) // t3 = inptr
- lw t8, 0(t7) // t8 = outptr
- beqz s1, 4f
- addu t5, t8, s1 // t5 = outend
-1:
- lb t2, 0(t3) // t2 = invalue = *inptr++
- addiu t3, 1
- beqz a0, 3f
- move t0, a0 // t0 = h_expand
-2:
- sb t2, 0(t8)
- addiu t0, -1
- bgtz t0, 2b
- addiu t8, 1
-3:
- bgt t5, t8, 1b
- nop
-4:
- addiu t9, a1, -1 // t9 = v_expand - 1
- blez t9, 9f
- nop
-5:
- lw t3, 0(s0)
- lw t4, 4(s0)
- subu t0, s1, 0xF
- blez t0, 7f
- addu t5, t3, s1 // t5 = end address
- andi t7, s1, 0xF // t7 = residual
- subu t8, t5, t7
-6:
- ulw t0, 0(t3)
- ulw t1, 4(t3)
- ulw t2, 8(t3)
- usw t0, 0(t4)
- ulw t0, 12(t3)
- usw t1, 4(t4)
- usw t2, 8(t4)
- usw t0, 12(t4)
- addiu t3, 16
- bne t3, t8, 6b
- addiu t4, 16
- beqz t7, 8f
- nop
-7:
- lbu t0, 0(t3)
- sb t0, 0(t4)
- addiu t3, 1
- bne t3, t5, 7b
- addiu t4, 1
-8:
- addiu t9, -1
- bgtz t9, 5b
- addiu s0, 8
-9:
- addu s3, s3, a1
- bne s3, s2, 0b
- addiu t6, 1
-10:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_int_upsample_mips_dspr2)
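
Integer upsampling is plain pixel replication: each input pixel is repeated h_expand times horizontally, and each finished row is copied v_expand - 1 more times (the word-copy loop at label 6: above is that row duplication). A simplified sketch, assuming a flat output buffer with out_stride bytes per row; the real routine works through JSAMPARRAY row pointers instead:

    #include <string.h>

    static void int_upsample_rows(const unsigned char *in, unsigned char *out,
                                  int in_width, int h_expand, int v_expand,
                                  int out_stride)
    {
      int col, h, v;
      for (col = 0; col < in_width; col++)         /* horizontal replication */
        for (h = 0; h < h_expand; h++)
          out[col * h_expand + h] = in[col];
      for (v = 1; v < v_expand; v++)               /* duplicate finished row */
        memcpy(out + v * out_stride, out, (size_t)(in_width * h_expand));
    }
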
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - cinfo->output_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
- lw t7, 0(a3) // t7 = output_data
- andi t8, a1, 0xf // t8 = residual
- sll t0, a0, 2
- blez a0, 4f
- addu t9, t7, t0 // t9 = output_data end address
-0:
- lw t5, 0(t7) // t5 = outptr
- lw t6, 0(a2) // t6 = inptr
- addu t3, t5, a1 // t3 = outptr + output_width (end address)
- subu t3, t8 // t3 = end address - residual
- beq t5, t3, 2f
- move t4, t8
-1:
- ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
- ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
- srl t1, t0, 16 // t1 = |X|X|P3|P2|
- ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
- ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
- ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
- ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t0, t2, 16 // t0 = |X|X|P7|P6|
- ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
- ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
- ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
- ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t5, 16
- bne t5, t3, 1b
- addiu t6, 8
- beqz t8, 3f
- move t4, t8
-2:
- lbu t1, 0(t6)
- sb t1, 0(t5)
- sb t1, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- addiu t7, 4
- bne t9, t7, 0b
- addiu a2, 4
-4:
- j ra
- nop
-END(jsimd_h2v1_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - cinfo->output_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
- lw t7, 0(a3)
- blez a0, 7f
- andi t9, a1, 0xf // t9 = residual
-0:
- lw t6, 0(a2) // t6 = inptr
- lw t5, 0(t7) // t5 = outptr
- addu t8, t5, a1 // t8 = outptr end address
- subu t8, t9 // t8 = end address - residual
- beq t5, t8, 2f
- move t4, t9
-1:
- ulw t0, 0(t6)
- srl t1, t0, 16
- ins t0, t0, 16, 16
- ins t0, t0, 8, 16
- ins t1, t1, 16, 16
- ins t1, t1, 8, 16
- ulw t2, 4(t6)
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t3, t2, 16
- ins t2, t2, 16, 16
- ins t2, t2, 8, 16
- ins t3, t3, 16, 16
- ins t3, t3, 8, 16
- usw t2, 8(t5)
- usw t3, 12(t5)
- addiu t5, 16
- bne t5, t8, 1b
- addiu t6, 8
- beqz t9, 3f
- move t4, t9
-2:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- sb t0, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- lw t6, 0(t7) // t6 = outptr[0]
- lw t5, 4(t7) // t5 = outptr[1]
- addu t4, t6, a1 // t4 = new end address
- beq a1, t9, 5f
- subu t8, t4, t9
-4:
- ulw t0, 0(t6)
- ulw t1, 4(t6)
- ulw t2, 8(t6)
- usw t0, 0(t5)
- ulw t0, 12(t6)
- usw t1, 4(t5)
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t6, 16
- bne t6, t8, 4b
- addiu t5, 16
- beqz t9, 6f
- nop
-5:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- addiu t6, 1
- bne t6, t4, 5b
- addiu t5, 1
-6:
- addiu t7, 8
- addiu a0, -2
- bgtz a0, 0b
- addiu a2, 4
-7:
- j ra
- nop
-END(jsimd_h2v2_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - compptr->dcttable
- * a2 - output
- * a3 - range_limit
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -256
- move v0, sp
- addiu v1, zero, 8 // v1 = DCTSIZE = 8
-1:
- lh s4, 32(a0) // s4 = inptr[16]
- lh s5, 64(a0) // s5 = inptr[32]
- lh s6, 96(a0) // s6 = inptr[48]
- lh t1, 112(a0) // t1 = inptr[56]
- lh t7, 16(a0) // t7 = inptr[8]
- lh t5, 80(a0) // t5 = inptr[40]
- lh t3, 48(a0) // t3 = inptr[24]
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, t5
- or s4, s4, t7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 2f
- addiu v1, v1, -1
- lh s5, 0(a1) // quantptr[DCTSIZE*0]
- lh s6, 0(a0) // inptr[DCTSIZE*0]
- mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
- sll s5, s5, 2
- sw s5, 0(v0)
- sw s5, 32(v0)
- sw s5, 64(v0)
- sw s5, 96(v0)
- sw s5, 128(v0)
- sw s5, 160(v0)
- sw s5, 192(v0)
- b 3f
- sw s5, 224(v0)
-2:
- lh t0, 112(a1)
- lh t2, 48(a1)
- lh t4, 80(a1)
- lh t6, 16(a1)
- mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
- mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
- mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
- mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
- lh t4, 32(a1)
- lh t5, 32(a0)
- lh t6, 96(a1)
- lh t7, 96(a0)
- addu s0, t0, t1 // z3 = tmp0 + tmp2
- addu s1, t1, t2 // z2 = tmp1 + tmp2
- addu s2, t2, t3 // z4 = tmp1 + tmp3
- addu s3, s0, s2 // z3 + z4
- addiu t9, zero, 9633 // FIX_1_175875602
- mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu t8, t0, t3 // z1 = tmp0 + tmp3
- addiu t9, zero, 2446 // FIX_0_298631336
- mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t9, zero, 16819 // FIX_2_053119869
- mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t9, zero, 25172 // FIX_3_072711026
- mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t9, zero, 12299 // FIX_1_501321110
- mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- addiu t9, zero, 16069 // FIX_1_961570560
- mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t9, zero, 3196 // FIX_0_390180644
- mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t9, zero, 7373 // FIX_0_899976223
- mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t9, zero, 20995 // FIX_2_562915447
- mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- subu s0, s3, s0 // z3 += z5
- addu t0, t0, s0 // tmp0 += z3
- addu t1, t1, s0 // tmp2 += z3
- subu s2, s3, s2 // z4 += z5
- addu t2, t2, s2 // tmp1 += z4
- addu t3, t3, s2 // tmp3 += z4
- subu t0, t0, t8 // tmp0 += z1
- subu t1, t1, s1 // tmp2 += z2
- subu t2, t2, s1 // tmp1 += z2
- subu t3, t3, t8 // tmp3 += z1
- mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
- addiu t9, zero, 6270 // FIX_0_765366865
- mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
- lh t4, 0(a1)
- lh t5, 0(a0)
- lh t6, 64(a1)
- lh t7, 64(a0)
- mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
- mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
- mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
- addiu t9, zero, 4433 // FIX_0_541196100
- addu s3, s0, s1 // z2 + z3
- mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t9, zero, 15137 // FIX_1_847759065
- mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
- addu t4, t5, t6
- subu t5, t5, t6
- sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
- sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
- addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
- addu s0, t4, t7
- subu s1, t4, t7
- addu s2, t5, t6
- subu s3, t5, t6
- addu t4, s0, t3
- subu s0, s0, t3
- addu t3, s2, t1
- subu s2, s2, t1
- addu t1, s3, t2
- subu s3, s3, t2
- addu t2, s1, t0
- subu s1, s1, t0
- shra_r.w t4, t4, 11
- shra_r.w t3, t3, 11
- shra_r.w t1, t1, 11
- shra_r.w t2, t2, 11
- shra_r.w s1, s1, 11
- shra_r.w s3, s3, 11
- shra_r.w s2, s2, 11
- shra_r.w s0, s0, 11
- sw t4, 0(v0)
- sw t3, 32(v0)
- sw t1, 64(v0)
- sw t2, 96(v0)
- sw s1, 128(v0)
- sw s3, 160(v0)
- sw s2, 192(v0)
- sw s0, 224(v0)
-3:
- addiu a1, a1, 2
- addiu a0, a0, 2
- bgtz v1, 1b
- addiu v0, v0, 4
- move v0, sp
- addiu v1, zero, 8
-4:
- lw t0, 8(v0) // z2 = (JLONG) wsptr[2]
- lw t1, 24(v0) // z3 = (JLONG) wsptr[6]
- lw t2, 0(v0) // (JLONG) wsptr[0]
- lw t3, 16(v0) // (JLONG) wsptr[4]
- lw s4, 4(v0) // (JLONG) wsptr[1]
- lw s5, 12(v0) // (JLONG) wsptr[3]
- lw s6, 20(v0) // (JLONG) wsptr[5]
- lw s7, 28(v0) // (JLONG) wsptr[7]
- or s4, s4, t0
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, s7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 5f
- addiu v1, v1, -1
- shra_r.w s5, t2, 5
- andi s5, s5, 0x3ff
- lbux s5, s5(a3)
- lw s1, 0(a2)
- replv.qb s5, s5
- usw s5, 0(s1)
- usw s5, 4(s1)
- b 6f
- nop
-5:
- addu t4, t0, t1 // z2 + z3
- addiu t8, zero, 4433 // FIX_0_541196100
- mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t8, zero, 15137 // FIX_1_847759065
- mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
- addiu t8, zero, 6270 // FIX_0_765366865
- mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
- addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4]
- subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4]
- sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
- sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
- subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
- subu t3, t2, t1 // tmp12 = tmp1 - tmp2
- addu t2, t2, t1 // tmp11 = tmp1 + tmp2
- addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t1, t4, t5 // tmp13 = tmp0 - tmp3
- addu t0, t4, t5 // tmp10 = tmp0 + tmp3
- lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7]
- lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3]
- lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5]
- lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1]
- addu s0, t4, t6 // z3 = tmp0 + tmp2
- addiu t8, zero, 9633 // FIX_1_175875602
- addu s1, t5, t7 // z4 = tmp1 + tmp3
- addu s2, s0, s1 // z3 + z4
- mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu s3, t4, t7 // z1 = tmp0 + tmp3
- addu t9, t5, t6 // z2 = tmp1 + tmp2
- addiu t8, zero, 16069 // FIX_1_961570560
- mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t8, zero, 3196 // FIX_0_390180644
- mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t8, zero, 2446 // FIX_0_298631336
- mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t8, zero, 7373 // FIX_0_899976223
- mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t8, zero, 16819 // FIX_2_053119869
- mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t8, zero, 20995 // FIX_2_562915447
- mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- addiu t8, zero, 25172 // FIX_3_072711026
- mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t8, zero, 12299 // FIX_1_501321110
- mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- subu s0, s2, s0 // z3 += z5
- subu s1, s2, s1 // z4 += z5
- addu t4, t4, s0
- subu t4, t4, s3 // tmp0
- addu t5, t5, s1
- subu t5, t5, t9 // tmp1
- addu t6, t6, s0
- subu t6, t6, t9 // tmp2
- addu t7, t7, s1
- subu t7, t7, s3 // tmp3
- addu s0, t0, t7
- subu t0, t0, t7
- addu t7, t2, t6
- subu t2, t2, t6
- addu t6, t3, t5
- subu t3, t3, t5
- addu t5, t1, t4
- subu t1, t1, t4
- shra_r.w s0, s0, 18
- shra_r.w t7, t7, 18
- shra_r.w t6, t6, 18
- shra_r.w t5, t5, 18
- shra_r.w t1, t1, 18
- shra_r.w t3, t3, 18
- shra_r.w t2, t2, 18
- shra_r.w t0, t0, 18
- andi s0, s0, 0x3ff
- andi t7, t7, 0x3ff
- andi t6, t6, 0x3ff
- andi t5, t5, 0x3ff
- andi t1, t1, 0x3ff
- andi t3, t3, 0x3ff
- andi t2, t2, 0x3ff
- andi t0, t0, 0x3ff
- lw s1, 0(a2)
- lbux s0, s0(a3)
- lbux t7, t7(a3)
- lbux t6, t6(a3)
- lbux t5, t5(a3)
- lbux t1, t1(a3)
- lbux t3, t3(a3)
- lbux t2, t2(a3)
- lbux t0, t0(a3)
- sb s0, 0(s1)
- sb t7, 1(s1)
- sb t6, 2(s1)
- sb t5, 3(s1)
- sb t1, 4(s1)
- sb t3, 5(s1)
- sb t2, 6(s1)
- sb t0, 7(s1)
-6:
- addiu v0, v0, 32
- bgtz v1, 4b
- addiu a2, a2, 4
- addiu sp, sp, 256
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_islow_mips_dspr2)
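
The comments in the routine above mirror libjpeg's MULTIPLY/DESCALE idiom from jidctint.c, with CONST_BITS = 13 fixed-point constants (4433 = FIX_0_541196100, 6270 = FIX_0_765366865, 15137 = FIX_1_847759065). As a rough scalar reference, a minimal C sketch of the even-part butterfly those comments describe (names follow the assembly comments; this is an illustration, not a drop-in replacement for the vector code):

  #include <stdint.h>

  #define CONST_BITS 13
  #define FIX_0_541196100  4433   /* 0.541196100 * 2^13, rounded */
  #define FIX_0_765366865  6270
  #define FIX_1_847759065 15137

  /* Round away the low n fraction bits, as shra_r.w does above. */
  static int32_t descale(int32_t x, int n) {
    return (x + (1 << (n - 1))) >> n;
  }

  /* Even part: z1/tmp2/tmp3 as in the assembly comments. */
  static void islow_even(int32_t z2, int32_t z3,
                         int32_t *tmp2, int32_t *tmp3) {
    int32_t z1 = (z2 + z3) * FIX_0_541196100;
    *tmp2 = z1 - z3 * FIX_1_847759065;
    *tmp3 = z1 + z2 * FIX_0_765366865;
  }

  /* Example: one output, descaled as in the "shra_r.w ..., 11"
   * instructions above (CONST_BITS - PASS1_BITS = 11). */
  static int32_t islow_out0(int32_t tmp0, int32_t tmp3) {
    return descale(tmp0 + tmp3, 11);
  }
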
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
-/*
- * a0 - inptr
- * a1 - quantptr
- * a2 - wsptr
- * a3 - mips_idct_ifast_coefs
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu t9, a0, 16 // end address
- or AT, a3, zero
-
-0:
- lw s0, 0(a1) // quantptr[DCTSIZE*0]
- lw t0, 0(a0) // inptr[DCTSIZE*0]
- lw t1, 16(a0) // inptr[DCTSIZE*1]
- muleq_s.w.phl v0, t0, s0 // tmp0 ...
- lw t2, 32(a0) // inptr[DCTSIZE*2]
- lw t3, 48(a0) // inptr[DCTSIZE*3]
- lw t4, 64(a0) // inptr[DCTSIZE*4]
- lw t5, 80(a0) // inptr[DCTSIZE*5]
- muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
- lw t6, 96(a0) // inptr[DCTSIZE*6]
- lw t7, 112(a0) // inptr[DCTSIZE*7]
- or s4, t1, t2
- or s5, t3, t4
- bnez s4, 1f
- ins t0, v0, 16, 16 // ... tmp0
- bnez s5, 1f
- or s6, t5, t6
- or s6, s6, t7
- bnez s6, 1f
- sw t0, 0(a2) // wsptr[DCTSIZE*0]
- sw t0, 16(a2) // wsptr[DCTSIZE*1]
- sw t0, 32(a2) // wsptr[DCTSIZE*2]
- sw t0, 48(a2) // wsptr[DCTSIZE*3]
- sw t0, 64(a2) // wsptr[DCTSIZE*4]
- sw t0, 80(a2) // wsptr[DCTSIZE*5]
- sw t0, 96(a2) // wsptr[DCTSIZE*6]
- sw t0, 112(a2) // wsptr[DCTSIZE*7]
- addiu a0, a0, 4
- b 2f
- addiu a1, a1, 4
-
-1:
- lw s1, 32(a1) // quantptr[DCTSIZE*2]
- lw s2, 64(a1) // quantptr[DCTSIZE*4]
- muleq_s.w.phl v0, t2, s1 // tmp1 ...
- muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
- lw s0, 16(a1) // quantptr[DCTSIZE*1]
- lw s1, 48(a1) // quantptr[DCTSIZE*3]
- lw s3, 96(a1) // quantptr[DCTSIZE*6]
- muleq_s.w.phl v1, t4, s2 // tmp2 ...
- muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
- lw s2, 80(a1) // quantptr[DCTSIZE*5]
- lw t8, 4(AT) // FIX(1.414213562)
- ins t2, v0, 16, 16 // ... tmp1
- muleq_s.w.phl v0, t6, s3 // tmp3 ...
- muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
- ins t4, v1, 16, 16 // ... tmp2
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- ins t6, v0, 16, 16 // ... tmp3
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- muleq_s.w.phl v0, t1, s0 // tmp4 ...
- muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
- shll_s.ph s6, s6, 1 // x2
- lw s3, 112(a1) // quantptr[DCTSIZE*7]
- subq.ph s6, s6, s7 // ... tmp12
- muleq_s.w.phl v1, t7, s3 // tmp7 ...
- muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
- ins t1, v0, 16, 16 // ... tmp4
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- muleq_s.w.phl v0, t5, s2 // tmp6 ...
- muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
- ins t7, v1, 16, 16 // ... tmp7
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- muleq_s.w.phl v1, t3, s1 // tmp5 ...
- muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
- ins t5, v0, 16, 16 // ... tmp6
- ins t3, v1, 16, 16 // ... tmp5
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7
- subq.ph s1, t0, t7
- mulq_s.ph v1, v1, t8 // ... z5
- shll_s.ph s5, s5, 1 // x2
- lw t8, 12(AT) // FIX(-2.613125930)
- sw s0, 0(a2) // wsptr[DCTSIZE*0]
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 4
- addiu a1, a1, 4
- sw s1, 112(a2) // wsptr[DCTSIZE*7]
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- addq.ph s2, t2, t5
- addq.ph t1, s4, t3 // tmp4
- subq.ph s3, t2, t5
- sw s2, 16(a2) // wsptr[DCTSIZE*1]
- sw s3, 96(a2) // wsptr[DCTSIZE*6]
- addq.ph v0, t4, t3
- subq.ph v1, t4, t3
- sw v0, 32(a2) // wsptr[DCTSIZE*2]
- sw v1, 80(a2) // wsptr[DCTSIZE*5]
- addq.ph v0, t6, t1
- subq.ph v1, t6, t1
- sw v0, 64(a2) // wsptr[DCTSIZE*4]
- sw v1, 48(a2) // wsptr[DCTSIZE*3]
-
-2:
- bne a0, t9, 0b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_ifast_cols_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
-/*
- * a0 - wsptr
- * a1 - output_buf
- * a2 - output_col
- * a3 - mips_idct_ifast_coefs
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- addiu t9, a0, 128 // end address
- lui s8, 0x8080
- ori s8, s8, 0x8080
-
-0:
- lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
- lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
- lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
- lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
- lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
- lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
- lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
- lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
- lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
- precrq.ph.w t1, s0, t0 // B b
- ins t0, s0, 16, 16 // A a
- bnez t1, 1f
- or s0, t2, s2
- bnez s0, 1f
- or s0, t4, s4
- bnez s0, 1f
- or s0, t6, s6
- bnez s0, 1f
- shll_s.ph s0, t0, 2 // A a
- lw a3, 0(a1)
- lw AT, 4(a1)
- precrq.ph.w t0, s0, s0 // A A
- ins s0, s0, 16, 16 // a a
- addu a3, a3, a2
- addu AT, AT, a2
- precrq.qb.ph t0, t0, t0 // A A A A
- precrq.qb.ph s0, s0, s0 // a a a a
- addu.qb s0, s0, s8
- addu.qb t0, t0, s8
- sw s0, 0(a3)
- sw s0, 4(a3)
- sw t0, 0(AT)
- sw t0, 4(AT)
- addiu a0, a0, 32
- bne a0, t9, 0b
- addiu a1, a1, 8
- b 2f
- nop
-
-1:
- precrq.ph.w t3, s2, t2
- ins t2, s2, 16, 16
- precrq.ph.w t5, s4, t4
- ins t4, s4, 16, 16
- precrq.ph.w t7, s6, t6
- ins t6, s6, 16, 16
- lw t8, 4(AT) // FIX(1.414213562)
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- shll_s.ph s6, s6, 1 // x2
- subq.ph s6, s6, s7 // ... tmp12
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7 // tmp0 + tmp7
- subq.ph s7, t0, t7 // tmp0 - tmp7
- mulq_s.ph v1, v1, t8 // ... z5
- lw a3, 0(a1)
- lw t8, 12(AT) // FIX(-2.613125930)
- shll_s.ph s5, s5, 1 // x2
- addu a3, a3, a2
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 32
- addiu a1, a1, 8
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- shll_s.ph s0, s0, 2
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- shll_s.ph s7, s7, 2
- addq.ph t1, s4, t3 // tmp4
- addq.ph s1, t2, t5 // tmp1 + tmp6
- subq.ph s6, t2, t5 // tmp1 - tmp6
- addq.ph s2, t4, t3 // tmp2 + tmp5
- subq.ph s5, t4, t3 // tmp2 - tmp5
- addq.ph s4, t6, t1 // tmp3 + tmp4
- subq.ph s3, t6, t1 // tmp3 - tmp4
- shll_s.ph s1, s1, 2
- shll_s.ph s2, s2, 2
- shll_s.ph s3, s3, 2
- shll_s.ph s4, s4, 2
- shll_s.ph s5, s5, 2
- shll_s.ph s6, s6, 2
- precrq.ph.w t0, s1, s0 // B A
- ins s0, s1, 16, 16 // b a
- precrq.ph.w t2, s3, s2 // D C
- ins s2, s3, 16, 16 // d c
- precrq.ph.w t4, s5, s4 // F E
- ins s4, s5, 16, 16 // f e
- precrq.ph.w t6, s7, s6 // H G
- ins s6, s7, 16, 16 // h g
- precrq.qb.ph t0, t2, t0 // D C B A
- precrq.qb.ph s0, s2, s0 // d c b a
- precrq.qb.ph t4, t6, t4 // H G F E
- precrq.qb.ph s4, s6, s4 // h g f e
- addu.qb s0, s0, s8
- addu.qb s4, s4, s8
- sw s0, 0(a3) // outptr[0/1/2/3] d c b a
- sw s4, 4(a3) // outptr[4/5/6/7] h g f e
- lw a3, -4(a1)
- addu.qb t0, t0, s8
- addu a3, a3, a2
- addu.qb t4, t4, s8
- sw t0, 0(a3) // outptr[0/1/2/3] D C B A
- bne a0, t9, 0b
- sw t4, 4(a3) // outptr[4/5/6/7] H G F E
-
-2:
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- j ra
- nop
-
-END(jsimd_idct_ifast_rows_mips_dspr2)
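
For orientation, the dequantize-and-butterfly sequences above follow libjpeg's AAN "ifast" IDCT (jidctfst.c), where constants such as FIX(1.414213562) act as 8-bit fixed-point multipliers. A hedged scalar sketch of the even-part butterfly that the tmp10..tmp13 comments refer to, assuming the Q8 scaling of the scalar ifast code (the assembly uses Q15 mulq_s.ph plus doubling shifts to the same effect):

  #include <stdint.h>

  #define IFAST_SCALE_BITS 8
  #define FIX_1_414213562 362          /* sqrt(2) in Q8 */

  static int16_t mul_q8(int16_t a, int32_t c) {
    return (int16_t)((a * c + 128) >> IFAST_SCALE_BITS);
  }

  /* in[0..3] = column entries 0, 4, 2, 6; out[0..3] = tmp0..tmp3. */
  static void ifast_even(const int16_t in[4], int16_t out[4]) {
    int16_t tmp10 = in[0] + in[1];
    int16_t tmp11 = in[0] - in[1];
    int16_t tmp13 = in[2] + in[3];
    int16_t tmp12 = mul_q8(in[2] - in[3], FIX_1_414213562) - tmp13;
    out[0] = tmp10 + tmp13;
    out[3] = tmp10 - tmp13;
    out[1] = tmp11 + tmp12;
    out[2] = tmp11 - tmp12;
  }
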
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
-/*
- * a0 - data
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- lui t0, 6437
- ori t0, 2260
- lui t1, 9633
- ori t1, 11363
- lui t2, 0xd39e
- ori t2, 0xe6dc
- lui t3, 0xf72d
- ori t3, 9633
- lui t4, 2261
- ori t4, 9633
- lui t5, 0xd39e
- ori t5, 6437
- lui t6, 9633
- ori t6, 0xd39d
- lui t7, 0xe6dc
- ori t7, 2260
- lui t8, 4433
- ori t8, 10703
- lui t9, 0xd630
- ori t9, 4433
- li s8, 8
- move a1, a0
-1:
- lw s0, 0(a1) // tmp0 = 1|0
- lw s1, 4(a1) // tmp1 = 3|2
- lw s2, 8(a1) // tmp2 = 5|4
- lw s3, 12(a1) // tmp3 = 7|6
- packrl.ph s1, s1, s1 // tmp1 = 2|3
- packrl.ph s3, s3, s3 // tmp3 = 6|7
- subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
- dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
- dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
- dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
- dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
- addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
- extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
- addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sh s0, 2(a1)
- sh s1, 6(a1)
- sh s2, 10(a1)
- sh s3, 14(a1)
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
- sra s4, s5, 16 // tmp4 = t11
- addiu a1, a1, 16
- addiu s8, s8, -1
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- addu s2, s5, s4 // tmp2 = t10 + t11
- subu s3, s5, s4 // tmp3 = t10 - t11
- sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
- sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
- sh s2, -16(a1)
- sh s3, -8(a1)
- sh s0, -12(a1)
- bgtz s8, 1b
- sh s1, -4(a1)
- li t0, 2260
- li t1, 11363
- li t2, 9633
- li t3, 6436
- li t4, 6437
- li t5, 2261
- li t6, 11362
- li t7, 2259
- li t8, 4433
- li t9, 10703
- li a1, 10704
- li s8, 8
-
-2:
- lh a2, 0(a0) // 0
- lh a3, 16(a0) // 8
- lh v0, 32(a0) // 16
- lh v1, 48(a0) // 24
- lh s4, 64(a0) // 32
- lh s5, 80(a0) // 40
- lh s6, 96(a0) // 48
- lh s7, 112(a0) // 56
- addu s2, v0, s5 // tmp2 = 16 + 40
- subu s5, v0, s5 // tmp5 = 16 - 40
- addu s3, v1, s4 // tmp3 = 24 + 32
- subu s4, v1, s4 // tmp4 = 24 - 32
- addu s0, a2, s7 // tmp0 = 0 + 56
- subu s7, a2, s7 // tmp7 = 0 - 56
- addu s1, a3, s6 // tmp1 = 8 + 48
- subu s6, a3, s6 // tmp6 = 8 - 48
- addu a2, s0, s3 // tmp10 = tmp0 + tmp3
- subu v1, s0, s3 // tmp13 = tmp0 - tmp3
- addu a3, s1, s2 // tmp11 = tmp1 + tmp2
- subu v0, s1, s2 // tmp12 = tmp1 - tmp2
- mult s7, t1 // ac0 = tmp7 * c1
- madd s4, t0 // ac0 += tmp4 * c0
- madd s5, t4 // ac0 += tmp5 * c4
- madd s6, t2 // ac0 += tmp6 * c2
- mult $ac1, s7, t2 // ac1 = tmp7 * c2
- msub $ac1, s4, t3 // ac1 -= tmp4 * c3
- msub $ac1, s5, t6 // ac1 -= tmp5 * c6
- msub $ac1, s6, t7 // ac1 -= tmp6 * c7
- mult $ac2, s7, t4 // ac2 = tmp7 * c4
- madd $ac2, s4, t2 // ac2 += tmp4 * c2
- madd $ac2, s5, t5 // ac2 += tmp5 * c5
- msub $ac2, s6, t6 // ac2 -= tmp6 * c6
- mult $ac3, s7, t0 // ac3 = tmp7 * c0
- msub $ac3, s4, t1 // ac3 -= tmp4 * c1
- madd $ac3, s5, t2 // ac3 += tmp5 * c2
- msub $ac3, s6, t3 // ac3 -= tmp6 * c3
- extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
- extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
- extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
- extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
- addiu s8, s8, -1
- addu s4, a2, a3 // tmp4 = tmp10 + tmp11
- subu s5, a2, a3 // tmp5 = tmp10 - tmp11
- sh s0, 16(a0)
- sh s1, 48(a0)
- sh s2, 80(a0)
- sh s3, 112(a0)
- mult v0, t8 // ac0 = tmp12 * c8
- madd v1, t9 // ac0 += tmp13 * c9
- mult $ac1, v1, t8 // ac1 = tmp13 * c8
- msub $ac1, v0, a1 // ac1 -= tmp12 * c10
- addiu a0, a0, 2
- extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
- extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
- shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
- shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
- sh s4, -2(a0)
- sh s5, 62(a0)
- sh s6, 30(a0)
- bgtz s8, 2b
- sh s7, 94(a0)
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- jr ra
- nop
-
-END(jsimd_fdct_islow_mips_dspr2)
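
The FDCT above leans on the DSPr2 dpa.w.ph instruction, which the inline comments expand into pairs of products. Per the MIPS DSP ASE semantics, it is a dual multiply-accumulate over the packed 16-bit halves of its operands; modeled roughly in C (the accumulator is returned here instead of living in ac0..ac3):

  #include <stdint.h>

  /* Model of "dpa.w.ph acc, rs, rt": both halfword products are
   * added into the 64-bit accumulator. */
  static int64_t dpa_w_ph(int64_t acc, uint32_t rs, uint32_t rt) {
    int32_t hi = (int16_t)(rs >> 16) * (int16_t)(rt >> 16);
    int32_t lo = (int16_t)(rs & 0xffff) * (int16_t)(rt & 0xffff);
    return acc + hi + lo;
  }
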
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
-/*
- * a0 - data
- */
- .set at
- SAVE_REGS_ON_STACK 8, s0, s1
- li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
- li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
- li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
- li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
-
- move v0, a0
- addiu v1, v0, 128 // end address
-
-0:
- lw t0, 0(v0) // tmp0 = 1|0
- lw t1, 4(v0) // tmp1 = 3|2
- lw t2, 8(v0) // tmp2 = 5|4
- lw t3, 12(v0) // tmp3 = 7|6
- packrl.ph t1, t1, t1 // tmp1 = 2|3
- packrl.ph t3, t3, t3 // tmp3 = 6|7
- subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
- addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
- addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sra t4, t8, 16 // tmp4 = t11
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
- dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
- precrq.ph.w t0, t5, t7 // t0 = t5|t6
- addq.ph t2, t8, t4 // tmp2 = t10 + t11
- subq.ph t3, t8, t4 // tmp3 = t10 - t11
- extr.w t4, $ac0, 8
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
- extr.w t0, $ac1, 8 // t0 = z5
- extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
- extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334)
- extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
- add t6, t1, t0 // t6 = z2
- add t7, t7, t0 // t7 = z4
- subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
- addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
- addq.ph t1, t0, t6 // t1 = z13 + z2
- subq.ph t6, t0, t6 // t6 = z13 - z2
- addq.ph t0, t8, t7 // t0 = z11 + z4
- subq.ph t7, t8, t7 // t7 = z11 - z4
- addq.ph t5, t4, t9
- subq.ph t4, t9, t4
- sh t2, 0(v0)
- sh t5, 4(v0)
- sh t3, 8(v0)
- sh t4, 12(v0)
- sh t1, 10(v0)
- sh t6, 6(v0)
- sh t0, 2(v0)
- sh t7, 14(v0)
- addiu v0, 16
- bne v1, v0, 0b
- nop
- move v0, a0
- addiu v1, v0, 16
-
-1:
- lh t0, 0(v0) // 0
- lh t1, 16(v0) // 8
- lh t2, 32(v0) // 16
- lh t3, 48(v0) // 24
- lh t4, 64(v0) // 32
- lh t5, 80(v0) // 40
- lh t6, 96(v0) // 48
- lh t7, 112(v0) // 56
- add t8, t0, t7 // t8 = tmp0
- sub t7, t0, t7 // t7 = tmp7
- add t0, t1, t6 // t0 = tmp1
- sub t1, t1, t6 // t1 = tmp6
- add t6, t2, t5 // t6 = tmp2
- sub t5, t2, t5 // t5 = tmp5
- add t2, t3, t4 // t2 = tmp3
- sub t3, t3, t4 // t3 = tmp4
- add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
- sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
- sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
- ins t8, s0, 16, 16 // t8 = tmp12|tmp13
- add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
- add s0, t4, t2 // s0 = tmp10 + tmp11
- sub t4, t4, t2 // t4 = tmp10-tmp11
- sh s0, 0(v0)
- sh t4, 64(v0)
- extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
- addq.ph t4, t8, t2 // t4 = tmp13 + z1
- subq.ph t8, t8, t2 // t8 = tmp13 - z1
- sh t4, 32(v0)
- sh t8, 96(v0)
- add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
- add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
- add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
- andi t4, a1, 0xffff
- mul s0, t1, t4
- sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
- ins t1, t3, 16, 16 // t1 = tmp10|tmp12
- mult $0, $0 // ac0 = 0
- mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
- extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
- add t2, t7, t8 // t2 = tmp7 + z5
- sub t7, t7, t8 // t7 = tmp7 - z5
- andi t4, a2, 0xffff
- mul t8, t3, t4
- sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
- andi t4, s1, 0xffff
- mul t6, t0, t4
- sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
- add t0, t6, t8 // t0 = z3 + z2
- sub t1, t6, t8 // t1 = z3 - z2
- add t3, t6, s0 // t3 = z3 + z4
- sub t4, t6, s0 // t4 = z3 - z4
- sub t5, t2, t1 // t5 = dataptr[5]
- sub t6, t7, t0 // t6 = dataptr[3]
- add t3, t2, t3 // t3 = dataptr[1]
- add t4, t7, t4 // t4 = dataptr[7]
- sh t5, 80(v0)
- sh t6, 48(v0)
- sh t3, 16(v0)
- sh t4, 112(v0)
- addiu v0, 2
- bne v0, v1, 1b
- nop
-
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-END(jsimd_fdct_ifast_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - divisors
- * a2 - workspace
- */
-
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2
-
- addiu v0, a2, 124 // v0 = workspace_end
- lh t0, 0(a2)
- lh t1, 0(a1)
- lh t2, 128(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- lh t8, 386(a1)
-
-1:
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- addiu a2, a2, 4
- addiu a1, a1, 4
- add s1, t6, t5
- andi s1, 0xffff
- sh v1, 0(a0)
-
- mul s2, s1, t7
- addiu s1, t8, 16
- srav s2, s2, s1
- mul s2, s2, s0
- lh t0, 0(a2)
- lh t1, 0(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t2, 128(a1)
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t8, 386(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- sh s2, 2(a0)
- lh t0, 0(a2)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- bne a2, v0, 1b
- addiu a0, a0, 4
-
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- sh v1, 0(a0)
- add s1, t6, t5
- andi s1, 0xffff
- mul s2, s1, t7
- addiu s1, t8, 16
- addiu a2, a2, 4
- addiu a1, a1, 4
- srav s2, s2, s1
- mul s2, s2, s0
- sh s2, 2(a0)
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2
-
- j ra
- nop
-
-END(jsimd_quantize_mips_dspr2)
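
The loop above divides each coefficient by multiplying with a precomputed reciprocal rather than using a hardware divide. Reading the byte offsets from the divisors pointer (0 = reciprocal, 128 = correction, 384 = shift; an assumption inferred from the loads, consistent with libjpeg-turbo's divisor-table layout), one coefficient is processed roughly like this:

  #include <stdint.h>

  static int16_t quantize_one(int16_t coef, uint16_t recip,
                              uint16_t corr, uint16_t shift) {
    /* the sra/sll/addiu sequence above builds sign = -1 or +1 */
    int sign = (coef < 0) ? -1 : 1;
    uint32_t mag = (uint32_t)(coef * sign);           /* |coef| */
    /* (|coef| + corr) * recip, then drop (shift + 16) bits */
    uint32_t q = ((mag + corr) * (uint32_t)recip) >> (shift + 16);
    return (int16_t)((int32_t)q * sign);
  }
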
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - divisors
- * a2 - workspace
- */
-
- .set at
-
- li t1, 0x46800100 // integer representation of 16384.5f
- mtc1 t1, f0
- li t0, 63
-0:
- lwc1 f2, 0(a2)
- lwc1 f10, 0(a1)
- lwc1 f4, 4(a2)
- lwc1 f12, 4(a1)
- lwc1 f6, 8(a2)
- lwc1 f14, 8(a1)
- lwc1 f8, 12(a2)
- lwc1 f16, 12(a1)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- lwc1 f10, 16(a1)
- lwc1 f12, 20(a1)
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- lwc1 f14, 24(a1)
- lwc1 f16, 28(a1)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- lwc1 f2, 16(a2)
- lwc1 f4, 20(a2)
- lwc1 f6, 24(a2)
- lwc1 f8, 28(a2)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- sh t1, 0(a0)
- sh t2, 2(a0)
- sh t3, 4(a0)
- sh t4, 6(a0)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- addiu t0, t0, -8
- addiu a2, a2, 32
- addiu a1, a1, 32
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- sh t1, 8(a0)
- sh t2, 10(a0)
- sh t3, 12(a0)
- sh t4, 14(a0)
- bgez t0, 0b
- addiu a0, a0, 16
-
- j ra
- nop
-
-END(jsimd_quantize_float_mips_dspr2)
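
The float variant above avoids an explicit rounding step: 0x46800100 is the IEEE-754 encoding of 16384.5f, so madd.s folds the multiply and a +16384.5 bias into one instruction; truncation then behaves like round-half-up for in-range values, and the -16384 fixup recovers the rounded quotient. A hedged scalar equivalent:

  #include <stdint.h>

  static int16_t quantize_float_one(float coef, float recip) {
    float biased = coef * recip + 16384.5f;  /* madd.s with f0 above */
    int32_t t = (int32_t)biased;             /* trunc.w.s */
    return (int16_t)(t - 16384);             /* addiu -16384 */
  }
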
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- addiu sp, sp, -40
- move v0, sp
- addiu s2, zero, 29692
- addiu s3, zero, -10426
- addiu s4, zero, 6967
- addiu s5, zero, -5906
- lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
- lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
- lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
- lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
- mul t4, t5, t0
- lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
- lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
- mul t6, t6, t1
- mul t5, t5, t0
- lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
- lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
- lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
- lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
- mul t7, t7, t2
- mult zero, zero
- mul t8, t8, t3
- li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
- li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
- ins t6, t5, 16, 16 // t6 = t5|t6
- sll t4, t4, 15
- dpa.w.ph $ac0, t6, s0
- lh t1, 2(a1)
- lh t6, 2(a0)
- ins t8, t7, 16, 16 // t8 = t7|t8
- dpa.w.ph $ac0, t8, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 18(a1)
- lh t6, 18(a0)
- lh t2, 50(a1)
- lh t7, 50(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 82(a1)
- lh t2, 82(a0)
- lh t3, 114(a1)
- lh t4, 114(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 0(v0)
- sw t8, 20(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 6(a1)
- lh t6, 6(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 22(a1)
- lh t6, 22(a0)
- lh t2, 54(a1)
- lh t7, 54(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 86(a1)
- lh t2, 86(a0)
- lh t3, 118(a1)
- lh t4, 118(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 4(v0)
- sw t8, 24(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 10(a1)
- lh t6, 10(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 26(a1)
- lh t6, 26(a0)
- lh t2, 58(a1)
- lh t7, 58(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 90(a1)
- lh t2, 90(a0)
- lh t3, 122(a1)
- lh t4, 122(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 8(v0)
- sw t8, 28(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 14(a1)
- lh t6, 14(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 30(a1)
- lh t6, 30(a0)
- lh t2, 62(a1)
- lh t7, 62(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 94(a1)
- lh t2, 94(a0)
- lh t3, 126(a1)
- lh t4, 126(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 12(v0)
- sw t8, 32(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- lw t9, 0(a2)
- lw t3, 0(v0)
- lw t7, 4(v0)
- lw t1, 8(v0)
- addu t9, t9, a3
- sll t3, t3, 15
- subu t8, t4, t0
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- shra_r.w t8, t8, 13
- sw t0, 16(v0)
- sw t8, 36(v0)
- lw t5, 12(v0)
- lw t6, 16(v0)
- mult t7, s2
- madd t1, s3
- madd t5, s4
- madd t6, s5
- lw t5, 24(v0)
- lw t7, 28(v0)
- mflo t0, $ac0
- lw t8, 32(v0)
- lw t2, 36(v0)
- mult $ac1, t5, s2
- madd $ac1, t7, s3
- madd $ac1, t8, s4
- madd $ac1, t2, s5
- addu t1, t3, t0
- subu t6, t3, t0
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- mflo t4, $ac1
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- lw t0, 20(v0)
- sb t1, 0(t9)
- sb t6, 1(t9)
- sll t0, t0, 15
- lw t9, 4(a2)
- addu t1, t0, t4
- subu t6, t0, t4
- addu t9, t9, a3
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- sb t1, 0(t9)
- sb t6, 1(t9)
- addiu sp, sp, 40
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-
-END(jsimd_idct_2x2_mips_dspr2)
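
The output stage above (shll_s.w by 24, sra by 24, addiu 128) is a saturating clamp: the saturating left shift pins the descaled value to a signed 8-bit range in the top byte, the arithmetic right shift brings it back down, and the +128 recenters it to the unsigned sample range. In approximate scalar C terms:

  #include <stdint.h>

  static uint8_t range_limit(int32_t x) {
    if (x >  127) x =  127;    /* shll_s.w saturation */
    if (x < -128) x = -128;
    return (uint8_t)(x + 128); /* recenter to [0, 255] */
  }
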
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
- */
-
- .set at
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw v1, 48(sp)
- move t0, a1
- move t1, v1
- li t9, 4
- li s0, 0x2e75f93e
- li s1, 0x21f9ba79
- li s2, 0xecc2efb0
- li s3, 0x52031ccd
-
-0:
- lh s6, 32(t0) // inptr[DCTSIZE*2]
- lh t6, 32(a0) // quantptr[DCTSIZE*2]
- lh s7, 96(t0) // inptr[DCTSIZE*6]
- lh t7, 96(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 0(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 0(a0) // quantptr[0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul t6, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
- lh t5, 112(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
- lh s4, 112(a0) // quantptr[DCTSIZE*7]
- lh v0, 80(t0) // inptr[DCTSIZE*5]
- lh s5, 80(a0) // quantptr[DCTSIZE*5]
- lh s6, 48(a0) // quantptr[DCTSIZE*3]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s7, 16(a0) // quantptr[DCTSIZE*1]
- lh t8, 16(t0) // inptr[DCTSIZE*1]
- subu t6, t6, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 48(t0) // inptr[DCTSIZE*3]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, t6 // tmp10 = tmp0 + z2
- subu t4, t2, t6 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, v0, 16, 16
- ins t7, t8, 16, 16
- addiu t9, t9, -1
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo s4, $ac0
- mflo s5, $ac1
- addiu a0, a0, 2
- addiu t1, t1, 4
- addiu t0, t0, 2
- addu t6, t4, s4
- subu t5, t4, s4
- addu s6, t3, s5
- subu s7, t3, s5
- shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw t6, 28(t1)
- sw t5, 60(t1)
- sw s6, -4(t1)
- bgtz t9, 0b
- sw s7, 92(t1)
- // second loop: three iterations
- li t9, 3
-1:
- lh s6, 34(t0) // inptr[DCTSIZE*2]
- lh t6, 34(a0) // quantptr[DCTSIZE*2]
- lh s7, 98(t0) // inptr[DCTSIZE*6]
- lh t7, 98(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 2(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 2(a0) // quantptr[DCTSIZE*0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul v0, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
- lh t5, 114(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
- lh s4, 114(a0) // quantptr[DCTSIZE*7]
- lh s5, 82(a0) // quantptr[DCTSIZE*5]
- lh t6, 82(t0) // inptr[DCTSIZE*5]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s6, 50(a0) // quantptr[DCTSIZE*3]
- lh t8, 18(t0) // inptr[DCTSIZE*1]
- subu v0, v0, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 50(t0) // inptr[DCTSIZE*3]
- lh s7, 18(a0) // quantptr[DCTSIZE*1]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, v0 // tmp10 = tmp0 + z2
- subu t4, t2, v0 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo t5, $ac0
- mflo t6, $ac1
- addiu t9, t9, -1
- addiu t0, t0, 2
- addiu a0, a0, 2
- addiu t1, t1, 4
- addu s5, t4, t5
- subu s4, t4, t5
- addu s6, t3, t6
- subu s7, t3, t6
- shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw s5, 32(t1)
- sw s4, 64(t1)
- sw s6, 0(t1)
- bgtz t9, 1b
- sw s7, 96(t1)
- move t1, v1
- li s4, 15137
- lw s6, 8(t1) // wsptr[2]
- li s5, 6270
- lw s7, 24(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 0(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 28(t1) // wsptr[7]
- lh t6, 20(t1) // wsptr[5]
- lh t7, 12(t1) // wsptr[3]
- lh t8, 4(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 0(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 2
- li s4, 15137
- lw s6, 40(t1) // wsptr[2]
- li s5, 6270
- lw s7, 56(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 32(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 60(t1) // wsptr[7]
- lh t6, 52(t1) // wsptr[5]
- lh t7, 44(t1) // wsptr[3]
- lh t8, 36(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
- sll s4, t9, 2
- lw v0, 4(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 3
- li s4, 15137
- lw s6, 72(t1) // wsptr[2]
- li s5, 6270
- lw s7, 88(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 64(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 92(t1) // wsptr[7]
- lh t6, 84(t1) // wsptr[5]
- lh t7, 76(t1) // wsptr[3]
- lh t8, 68(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 8(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- li s4, 15137
- lw s6, 104(t1) // wsptr[2]
- li s5, 6270
- lw s7, 120(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 96(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
- lh t5, 124(t1) // wsptr[7]
- lh t6, 116(t1) // wsptr[5]
- lh t7, 108(t1) // wsptr[3]
- lh t8, 100(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2;
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2;
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 12(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_idct_4x4_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -144
- move v0, sp
- addiu v1, v0, 24
- addiu t9, zero, 5793
- addiu s0, zero, 10033
- addiu s1, zero, 2998
-
-1:
- lh s2, 0(a0) // q0 = quantptr[ 0]
- lh s3, 32(a0) // q1 = quantptr[16]
- lh s4, 64(a0) // q2 = quantptr[32]
- lh t2, 64(a1) // tmp2 = inptr[32]
- lh t1, 32(a1) // tmp1 = inptr[16]
- lh t0, 0(a1) // tmp0 = inptr[ 0]
- mul t2, t2, s4 // tmp2 = tmp2 * q2
- mul t1, t1, s3 // tmp1 = tmp1 * q1
- mul t0, t0, s2 // tmp0 = tmp0 * q0
- lh t6, 16(a1) // z1 = inptr[ 8]
- lh t8, 80(a1) // z3 = inptr[40]
- lh t7, 48(a1) // z2 = inptr[24]
- lh s2, 16(a0) // q0 = quantptr[ 8]
- lh s4, 80(a0) // q2 = quantptr[40]
- lh s3, 48(a0) // q1 = quantptr[24]
- mul t2, t2, t9 // tmp2 = tmp2 * 5793
- mul t1, t1, s0 // tmp1 = tmp1 * 10033
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- mul t6, t6, s2 // z1 = z1 * q0
- mul t8, t8, s4 // z3 = z3 * q2
- mul t7, t7, s3 // z2 = z2 * q1
- addu t3, t0, t2 // tmp10 = tmp0 + tmp2
- sll t2, t2, 1 // tmp2 = tmp2 << 1
- subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
- subu t5, t3, t1 // tmp12 = tmp10 - tmp1
- addu t3, t3, t1 // tmp10 = tmp10 + tmp1
- addu t1, t6, t8 // tmp1 = z1 + z3
- mul t1, t1, s1 // tmp1 = tmp1 * 2998
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- subu t2, t6, t8 // tmp2 = z1 - z3
- subu t2, t2, t7 // tmp2 = tmp2 - z2
- sll t2, t2, 2 // tmp2 = tmp2 << 2
- addu t0, t6, t7 // tmp0 = z1 + z2
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- subu s2, t8, t7 // q0 = z3 - z2
- sll s2, s2, 13 // q0 = q0 << 13
- addu t0, t0, t1 // tmp0 = tmp0 + tmp1
- addu t1, s2, t1 // tmp1 = q0 + tmp1
- addu s2, t4, t2 // q0 = tmp11 + tmp2
- subu s3, t4, t2 // q1 = tmp11 - tmp2
- addu t6, t3, t0 // z1 = tmp10 + tmp0
- subu t7, t3, t0 // z2 = tmp10 - tmp0
- addu t4, t5, t1 // tmp11 = tmp12 + tmp1
- subu t5, t5, t1 // tmp12 = tmp12 - tmp1
- shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
- shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
- sw s2, 24(v0)
- sw s3, 96(v0)
- sw t6, 0(v0)
- sw t7, 120(v0)
- sw t4, 48(v0)
- sw t5, 72(v0)
- addiu v0, v0, 4
- addiu a1, a1, 2
- bne v0, v1, 1b
- addiu a0, a0, 2
-
- /* Pass 2: process 6 rows from work array, store into output array. */
- move v0, sp
- addiu v1, v0, 144
-
-2:
- lw t0, 0(v0)
- lw t2, 16(v0)
- lw s5, 0(a2)
- addiu t0, t0, 16
- sll t0, t0, 13
- mul t3, t2, t9
- lw t6, 4(v0)
- lw t8, 20(v0)
- lw t7, 12(v0)
- addu s5, s5, a3
- addu s6, t6, t8
- mul s6, s6, s1
- addu t1, t0, t3
- subu t4, t0, t3
- subu t4, t4, t3
- lw t3, 8(v0)
- mul t0, t3, s0
- addu s7, t6, t7
- sll s7, s7, 13
- addu s7, s6, s7
- subu t2, t8, t7
- sll t2, t2, 13
- addu t2, s6, t2
- subu s6, t6, t7
- subu s6, s6, t8
- sll s6, s6, 13
- addu t3, t1, t0
- subu t5, t1, t0
- addu t6, t3, s7
- subu t3, t3, s7
- addu t7, t4, s6
- subu t4, t4, s6
- addu t8, t5, t2
- subu t5, t5, t2
- shll_s.w t6, t6, 6
- shll_s.w t3, t3, 6
- shll_s.w t7, t7, 6
- shll_s.w t4, t4, 6
- shll_s.w t8, t8, 6
- shll_s.w t5, t5, 6
- sra t6, t6, 24
- addiu t6, t6, 128
- sra t3, t3, 24
- addiu t3, t3, 128
- sb t6, 0(s5)
- sra t7, t7, 24
- addiu t7, t7, 128
- sb t3, 5(s5)
- sra t4, t4, 24
- addiu t4, t4, 128
- sb t7, 1(s5)
- sra t8, t8, 24
- addiu t8, t8, 128
- sb t4, 4(s5)
- addiu v0, v0, 24
- sra t5, t5, 24
- addiu t5, t5, 128
- sb t8, 2(s5)
- addiu a2, a2, 4
- bne v0, v1, 2b
- sb t5, 3(s5)
-
- addiu sp, sp, 144
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_6x6_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - workspace
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 8
-
-1:
- // odd part
- lh t0, 48(a1)
- lh t1, 48(a0)
- lh t2, 16(a1)
- lh t3, 16(a0)
- lh t4, 80(a1)
- lh t5, 80(a0)
- lh t6, 112(a1)
- lh t7, 112(a0)
- mul t0, t0, t1 // z2
- mul t1, t2, t3 // z1
- mul t2, t4, t5 // z3
- mul t3, t6, t7 // z4
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- li t6, 7053 // FIX(0.860918669)
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t7, t1, t2 // tmp10
- addu t8, t7, t3 // tmp10 + z4
- mul t6, t6, t8 // tmp15
- li t8, 2139 // FIX(0.261052384)
- mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
- li t7, 2295 // FIX(0.280143716)
- mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1-=z4
- subu t0, t0, t2 // z2-=z3
- addu t2, t0, t1 // z1+z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t8, t6, t8 // tmp12
- addu t3, t8, t4 // tmp12 + tmp11
- addu t3, t3, t7 // tmp10
- subu t8, t8, t9 // tmp12 + tmp13
- addu s0, t5, s0
- subu t8, t8, s0 // tmp12
- subu t9, t6, t9
- subu s1, s1, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- // even part start
- lh t4, 64(a1)
- lh t5, 64(a0)
- lh t7, 32(a1)
- lh s0, 32(a0)
- lh s1, 0(a1)
- lh s2, 0(a0)
- lh s3, 96(a1)
- lh v0, 96(a0)
- mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
- mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
- mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
- mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
- // odd part end
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // update counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 2
- addiu a1, a1, 2
- // even part rest
- li s1, 10033
- li s2, 11190
- mul t4, t4, s1 // z4
- mul s1, t5, s2 // z4
- sll t5, t5, 13 // z1
- sll t7, t7, 13
- addiu t7, t7, 1024 // z3
- sll s0, s0, 13 // z2
- addu s2, t7, t4 // tmp10
- subu t4, t7, t4 // tmp11
- subu s3, t5, s0 // tmp12
- addu t2, t7, s3 // tmp21
- subu s3, t7, s3 // tmp24
- addu t7, s1, s0 // tmp12
- addu v0, s2, t7 // tmp20
- subu s2, s2, t7 // tmp25
- subu s1, s1, t5 // z4 - z1
- subu s1, s1, s0 // tmp12
- addu s0, t4, s1 // tmp22
- subu t4, t4, s1 // tmp23
- // final output stage
- addu t5, v0, t3
- subu v0, v0, t3
- addu t3, t2, t1
- subu t2, t2, t1
- addu t1, s0, t8
- subu s0, s0, t8
- addu t8, t4, t9
- subu t4, t4, t9
- addu t9, s3, t0
- subu s3, s3, t0
- addu t0, s2, t6
- subu s2, s2, t6
- sra t5, t5, 11
- sra t3, t3, 11
- sra t1, t1, 11
- sra t8, t8, 11
- sra t9, t9, 11
- sra t0, t0, 11
- sra s2, s2, 11
- sra s3, s3, 11
- sra t4, t4, 11
- sra s0, s0, 11
- sra t2, t2, 11
- sra v0, v0, 11
- sw t5, 0(a2)
- sw t3, 32(a2)
- sw t1, 64(a2)
- sw t8, 96(a2)
- sw t9, 128(a2)
- sw t0, 160(a2)
- sw s2, 192(a2)
- sw s3, 224(a2)
- sw t4, 256(a2)
- sw s0, 288(a2)
- sw t2, 320(a2)
- sw v0, 352(a2)
- bgtz a3, 1b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-
-END(jsimd_idct_12x12_pass1_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
-/*
- * a0 - workspace
- * a1 - output
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 12
-
-1:
- // Odd part
- lw t0, 12(a0)
- lw t1, 4(a0)
- lw t2, 20(a0)
- lw t3, 28(a0)
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t6, t1, t2 // tmp10
- li t7, 2139 // FIX(0.261052384)
- mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
- addu t6, t6, t3 // tmp10 + z4
- li t8, 7053 // FIX(0.860918669)
- mul t6, t6, t8 // tmp15
- li t8, 2295 // FIX(0.280143716)
- mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1 -= z4
- subu t0, t0, t2 // z2 -= z3
- addu t2, t1, t0 // z1 + z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t3, t6, t7 // tmp12
- addu t7, t3, t4
- addu t7, t7, t8 // tmp10
- subu t3, t3, t9
- subu t3, t3, t5
- subu t3, t3, s0 // tmp12
- subu t9, t6, t9
- subu t9, t9, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // even part
- lw t2, 16(a0) // z4
- lw t4, 8(a0) // z1
- lw t5, 0(a0) // z3
- lw t8, 24(a0) // z2
- li s0, 10033 // FIX(1.224744871)
- li s1, 11190 // FIX(1.366025404)
- mul t2, t2, s0 // z4
- mul s0, t4, s1 // z4
- addiu t5, t5, 0x10
- sll t5, t5, 13 // z3
- sll t4, t4, 13 // z1
- sll t8, t8, 13 // z2
- subu s1, t4, t8 // tmp12
- addu s2, t5, t2 // tmp10
- subu t2, t5, t2 // tmp11
- addu s3, t5, s1 // tmp21
- subu s1, t5, s1 // tmp24
- addu t5, s0, t8 // tmp12
- addu v0, s2, t5 // tmp20
- subu t5, s2, t5 // tmp25
- subu t4, s0, t4
- subu t4, t4, t8 // tmp12
- addu t8, t2, t4 // tmp22
- subu t2, t2, t4 // tmp23
- // increment counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 32
- // Final stage
- addu t4, v0, t7
- subu v0, v0, t7
- addu t7, s3, t1
- subu s3, s3, t1
- addu t1, t8, t3
- subu t8, t8, t3
- addu t3, t2, t9
- subu t2, t2, t9
- addu t9, s1, t0
- subu s1, s1, t0
- addu t0, t5, t6
- subu t5, t5, t6
- sll t4, t4, 4
- sll t7, t7, 4
- sll t1, t1, 4
- sll t3, t3, 4
- sll t9, t9, 4
- sll t0, t0, 4
- sll t5, t5, 4
- sll s1, s1, 4
- sll t2, t2, 4
- sll t8, t8, 4
- sll s3, s3, 4
- sll v0, v0, 4
- shll_s.w t4, t4, 2
- shll_s.w t7, t7, 2
- shll_s.w t1, t1, 2
- shll_s.w t3, t3, 2
- shll_s.w t9, t9, 2
- shll_s.w t0, t0, 2
- shll_s.w t5, t5, 2
- shll_s.w s1, s1, 2
- shll_s.w t2, t2, 2
- shll_s.w t8, t8, 2
- shll_s.w s3, s3, 2
- shll_s.w v0, v0, 2
- srl t4, t4, 24
- srl t7, t7, 24
- srl t1, t1, 24
- srl t3, t3, 24
- srl t9, t9, 24
- srl t0, t0, 24
- srl t5, t5, 24
- srl s1, s1, 24
- srl t2, t2, 24
- srl t8, t8, 24
- srl s3, s3, 24
- srl v0, v0, 24
- lw t6, 0(a1)
- addiu t4, t4, 0x80
- addiu t7, t7, 0x80
- addiu t1, t1, 0x80
- addiu t3, t3, 0x80
- addiu t9, t9, 0x80
- addiu t0, t0, 0x80
- addiu t5, t5, 0x80
- addiu s1, s1, 0x80
- addiu t2, t2, 0x80
- addiu t8, t8, 0x80
- addiu s3, s3, 0x80
- addiu v0, v0, 0x80
- sb t4, 0(t6)
- sb t7, 1(t6)
- sb t1, 2(t6)
- sb t3, 3(t6)
- sb t9, 4(t6)
- sb t0, 5(t6)
- sb t5, 6(t6)
- sb s1, 7(t6)
- sb t2, 8(t6)
- sb t8, 9(t6)
- sb s3, 10(t6)
- sb v0, 11(t6)
- bgtz a3, 1b
- addiu a1, a1, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- jr ra
- nop
-
-END(jsimd_idct_12x12_pass2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
-/*
- * a0 - sample_data
- * a1 - start_col
- * a2 - workspace
- */
-
- lw t0, 0(a0)
- li t7, 0xff80ff80
- addu t0, t0, a1
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- lw t0, 4(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 0(a2)
- usw t4, 4(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 8(a2)
- usw t6, 12(a2)
-
- lw t0, 8(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 16(a2)
- usw t4, 20(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 24(a2)
- usw t6, 28(a2)
-
- lw t0, 12(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 32(a2)
- usw t4, 36(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 40(a2)
- usw t6, 44(a2)
-
- lw t0, 16(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 48(a2)
- usw t4, 52(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 56(a2)
- usw t6, 60(a2)
-
- lw t0, 20(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 64(a2)
- usw t4, 68(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 72(a2)
- usw t6, 76(a2)
-
- lw t0, 24(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 80(a2)
- usw t4, 84(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 88(a2)
- usw t6, 92(a2)
-
- lw t0, 28(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 96(a2)
- usw t4, 100(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 104(a2)
- usw t6, 108(a2)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 112(a2)
- usw t4, 116(a2)
- usw t5, 120(a2)
- usw t6, 124(a2)
-
- j ra
- nop
-
-END(jsimd_convsamp_mips_dspr2)
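
jsimd_convsamp_mips_dspr2 above widens 8-bit samples to 16 bits and recenters them around zero; the 0xff80ff80 constant is -128 in each 16-bit lane, so every addu.ph subtracts CENTERJSAMPLE from a pair of samples at once. A plain scalar version for reference (type names as in libjpeg):

  #include <stdint.h>

  typedef uint8_t JSAMPLE;
  typedef int16_t DCTELEM;

  static void convsamp(JSAMPLE *sample_data[8], int start_col,
                       DCTELEM workspace[64]) {
    for (int row = 0; row < 8; row++) {
      const JSAMPLE *p = sample_data[row] + start_col;
      for (int col = 0; col < 8; col++)
        workspace[row * 8 + col] = (DCTELEM)p[col] - 128; /* CENTERJSAMPLE */
    }
  }
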
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
-/*
- * a0 - sample_data
- * a1 - start_col
- * a2 - workspace
- */
-
- .set at
-
- lw t0, 0(a0)
- addu t0, t0, a1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 4(a0)
- swc1 f2, 0(a2)
- swc1 f4, 4(a2)
- swc1 f6, 8(a2)
- addu t0, t0, a1
- swc1 f8, 12(a2)
- swc1 f10, 16(a2)
- swc1 f12, 20(a2)
- swc1 f14, 24(a2)
- swc1 f16, 28(a2)
- // element row 1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 8(a0)
- swc1 f2, 32(a2)
- swc1 f4, 36(a2)
- swc1 f6, 40(a2)
- addu t0, t0, a1
- swc1 f8, 44(a2)
- swc1 f10, 48(a2)
- swc1 f12, 52(a2)
- swc1 f14, 56(a2)
- swc1 f16, 60(a2)
- // element row 2
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 12(a0)
- swc1 f2, 64(a2)
- swc1 f4, 68(a2)
- swc1 f6, 72(a2)
- addu t0, t0, a1
- swc1 f8, 76(a2)
- swc1 f10, 80(a2)
- swc1 f12, 84(a2)
- swc1 f14, 88(a2)
- swc1 f16, 92(a2)
- // element row 3
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 16(a0)
- swc1 f2, 96(a2)
- swc1 f4, 100(a2)
- swc1 f6, 104(a2)
- addu t0, t0, a1
- swc1 f8, 108(a2)
- swc1 f10, 112(a2)
- swc1 f12, 116(a2)
- swc1 f14, 120(a2)
- swc1 f16, 124(a2)
- // element row 4
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 20(a0)
- swc1 f2, 128(a2)
- swc1 f4, 132(a2)
- swc1 f6, 136(a2)
- addu t0, t0, a1
- swc1 f8, 140(a2)
- swc1 f10, 144(a2)
- swc1 f12, 148(a2)
- swc1 f14, 152(a2)
- swc1 f16, 156(a2)
- // element row 5
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 24(a0)
- swc1 f2, 160(a2)
- swc1 f4, 164(a2)
- swc1 f6, 168(a2)
- addu t0, t0, a1
- swc1 f8, 172(a2)
- swc1 f10, 176(a2)
- swc1 f12, 180(a2)
- swc1 f14, 184(a2)
- swc1 f16, 188(a2)
- // element row 6
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 28(a0)
- swc1 f2, 192(a2)
- swc1 f4, 196(a2)
- swc1 f6, 200(a2)
- addu t0, t0, a1
- swc1 f8, 204(a2)
- swc1 f10, 208(a2)
- swc1 f12, 212(a2)
- swc1 f14, 216(a2)
- swc1 f16, 220(a2)
- // element row 7
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- swc1 f2, 224(a2)
- swc1 f4, 228(a2)
- swc1 f6, 232(a2)
- swc1 f8, 236(a2)
- swc1 f10, 240(a2)
- swc1 f12, 244(a2)
- swc1 f14, 248(a2)
- swc1 f16, 252(a2)
-
- j ra
- nop // branch delay slot
-
-END(jsimd_convsamp_float_mips_dspr2)
-
-/*****************************************************************************/
-
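The DSPr2 routine removed above is the float sample-conversion step: for each of the eight block rows it fetches a row pointer, loads eight unsigned samples, re-centers them around zero (lbu / addiu -128), widens them to single precision (mtc1 / cvt.s.w), and stores them into the 8x8 float workspace. A minimal scalar C sketch of the same transform follows; the function name and row-pointer type are illustrative, not libjpeg-turbo API:

    /* Center 8-bit samples around zero and widen to float, one 8x8 block. */
    void convsamp_float_scalar(const unsigned char *rows[8],
                               unsigned int start_col, float *workspace)
    {
      for (int r = 0; r < 8; r++) {
        const unsigned char *p = rows[r] + start_col;  /* lw + addu in the asm */
        for (int c = 0; c < 8; c++)
          *workspace++ = (float)(p[c] - 128);          /* lbu, addiu, cvt.s.w */
      }
    }
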
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h b/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
deleted file mode 100644
index 64f9880482..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
- * Darko Laus (darko.laus@imgtec.com)
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define zero $0
-#define AT $1
-#define v0 $2
-#define v1 $3
-#define a0 $4
-#define a1 $5
-#define a2 $6
-#define a3 $7
-#define t0 $8
-#define t1 $9
-#define t2 $10
-#define t3 $11
-#define t4 $12
-#define t5 $13
-#define t6 $14
-#define t7 $15
-#define s0 $16
-#define s1 $17
-#define s2 $18
-#define s3 $19
-#define s4 $20
-#define s5 $21
-#define s6 $22
-#define s7 $23
-#define t8 $24
-#define t9 $25
-#define k0 $26
-#define k1 $27
-#define gp $28
-#define sp $29
-#define fp $30
-#define s8 $30
-#define ra $31
-
-#define f0 $f0
-#define f1 $f1
-#define f2 $f2
-#define f3 $f3
-#define f4 $f4
-#define f5 $f5
-#define f6 $f6
-#define f7 $f7
-#define f8 $f8
-#define f9 $f9
-#define f10 $f10
-#define f11 $f11
-#define f12 $f12
-#define f13 $f13
-#define f14 $f14
-#define f15 $f15
-#define f16 $f16
-#define f17 $f17
-#define f18 $f18
-#define f19 $f19
-#define f20 $f20
-#define f21 $f21
-#define f22 $f22
-#define f23 $f23
-#define f24 $f24
-#define f25 $f25
-#define f26 $f26
-#define f27 $f27
-#define f28 $f28
-#define f29 $f29
-#define f30 $f30
-#define f31 $f31
-
-/*
- * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
- */
-#define LEAF_MIPS32R2(symbol) \
- .globl symbol; \
- .align 2; \
- .type symbol, @function; \
- .ent symbol, 0; \
-symbol: .frame sp, 0, ra; \
- .set push; \
- .set arch=mips32r2; \
- .set noreorder; \
- .set noat;
-
-/*
- * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
- */
-#define LEAF_MIPS_DSPR2(symbol) \
-LEAF_MIPS32R2(symbol) \
- .set dspr2;
-
-/*
- * END - mark end of function
- */
-#define END(function) \
- .set pop; \
- .end function; \
- .size function,.-function
-
-/*
- * Checks whether the stack offset is big enough for storing/restoring
- * regs_num registers to/from the stack. The stack offset must be greater
- * than or equal to the number of bytes needed to store the registers
- * (regs_num * 4). Since the MIPS ABI allows use of the first 16 bytes of
- * the stack frame (reserved for the function's input arguments, which are
- * already in a0-a3), the stack size can be reduced by reusing this space.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
-
-/*
- * Saves a set of registers on the stack. The maximum number of registers
- * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7). The stack
- * offset is the number of bytes subtracted from the stack pointer (sp)
- * before the registers are pushed, to provide enough space on the stack
- * (the offset must be a multiple of 4 and big enough, as described by the
- * CHECK_STACK_OFFSET macro). This macro is intended to be used in
- * combination with RESTORE_REGS_FROM_STACK macro. Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
- .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
- .error "Stack offset must be positive and a multiple of 4."
- .endif
- .if \stack_offset != 0
- addiu sp, sp, -\stack_offset
- .endif
- sw \r1, 0(sp)
- .if \r2 != 0
- sw \r2, 4(sp)
- .endif
- .if \r3 != 0
- sw \r3, 8(sp)
- .endif
- .if \r4 != 0
- sw \r4, 12(sp)
- .endif
- .if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- sw \r5, 16(sp)
- .endif
- .if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- sw \r6, 20(sp)
- .endif
- .if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- sw \r7, 24(sp)
- .endif
- .if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- sw \r8, 28(sp)
- .endif
- .if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- sw \r9, 32(sp)
- .endif
- .if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- sw \r10, 36(sp)
- .endif
- .if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- sw \r11, 40(sp)
- .endif
- .if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- sw \r12, 44(sp)
- .endif
- .if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- sw \r13, 48(sp)
- .endif
- .if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- sw \r14, 52(sp)
- .endif
-.endm
-
-/*
- * Restores a set of registers from the stack. The maximum number of
- * registers that can be restored is limited to 14 (a0-a3, v0-v1 and s0-s7).
- * The stack offset is the number of bytes added back to the stack pointer
- * (sp) after the registers are restored (the offset must be a multiple of 4
- * and big enough, as described by the CHECK_STACK_OFFSET macro). This macro
- * is intended to be used in combination with the SAVE_REGS_ON_STACK macro.
- * Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
- .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
- .error "Stack offset must be positive and a multiple of 4."
- .endif
- lw \r1, 0(sp)
- .if \r2 != 0
- lw \r2, 4(sp)
- .endif
- .if \r3 != 0
- lw \r3, 8(sp)
- .endif
- .if \r4 != 0
- lw \r4, 12(sp)
- .endif
- .if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- lw \r5, 16(sp)
- .endif
- .if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- lw \r6, 20(sp)
- .endif
- .if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- lw \r7, 24(sp)
- .endif
- .if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- lw \r8, 28(sp)
- .endif
- .if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- lw \r9, 32(sp)
- .endif
- .if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- lw \r10, 36(sp)
- .endif
- .if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- lw \r11, 40(sp)
- .endif
- .if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- lw \r12, 44(sp)
- .endif
- .if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- lw \r13, 48(sp)
- .endif
- .if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- lw \r14, 52(sp)
- .endif
- .if \stack_offset != 0
- addiu sp, sp, \stack_offset
- .endif
-.endm
-
-
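The save/restore pair above leans on one o32 ABI detail: the caller always reserves a 16-byte argument save area at its stack pointer, so a routine may spill its first four registers there without growing its own frame, and only needs new stack space for the rest. That is exactly the inequality CHECK_STACK_OFFSET enforces; here is a C sketch of the check, with a worked example in the comment:

    /* stack_offset bytes of new frame plus the 16-byte o32 argument area
     * must cover regs_num 4-byte slots.  E.g. saving 6 registers needs
     * 24 bytes, 16 of which the argument area covers, so stack_offset = 8
     * is the smallest value that passes. */
    int stack_offset_ok(int regs_num, int stack_offset)
    {
      return stack_offset >= regs_num * 4 - 16;
    }
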
diff --git a/media/libjpeg/simd/jsimd_powerpc.c b/media/libjpeg/simd/jsimd_powerpc.c
deleted file mode 100644
index 42dc1e0868..0000000000
--- a/media/libjpeg/simd/jsimd_powerpc.c
+++ /dev/null
@@ -1,828 +0,0 @@
-/*
- * jsimd_powerpc.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * PowerPC architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
- char *p;
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "cpu", 3) != 0)
- return 0;
- buffer += 3;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "altivec"))
- simd_support |= JSIMD_ALTIVEC;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__ALTIVEC__) || defined(__APPLE__)
- simd_support |= JSIMD_ALTIVEC;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEALTIVEC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ALTIVEC;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_extrgb_ycc_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_extrgbx_ycc_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_extbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_extbgrx_ycc_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_extxbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_extxrgb_ycc_convert_altivec;
- break;
- default:
- altivecfct=jsimd_rgb_ycc_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_extrgb_gray_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_extrgbx_gray_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_extbgr_gray_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_extbgrx_gray_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_extxbgr_gray_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_extxrgb_gray_convert_altivec;
- break;
- default:
- altivecfct=jsimd_rgb_gray_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_ycc_extrgb_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_ycc_extrgbx_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_ycc_extbgr_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_ycc_extbgrx_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_ycc_extxbgr_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_ycc_extxrgb_convert_altivec;
- break;
- default:
- altivecfct=jsimd_ycc_rgb_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct=jsimd_h2v2_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct=jsimd_h2v1_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_altivec(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_altivec(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
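Each of these glue files uses the same lazy, probe-once detection scheme: simd_support starts at ~0 as an "unprobed" sentinel, init_simd() fills in real feature bits on first use (hence the racy-but-idempotent FIXME), and the JSIMD_FORCE*/JSIMD_FORCENONE environment variables override the probe. A condensed C sketch of the pattern, with the probing step elided:

    #include <stdlib.h>
    #include <string.h>

    static unsigned int simd_support = ~0u;    /* ~0 means "not probed yet" */

    static void init_simd_sketch(void)
    {
      char *env;

      if (simd_support != ~0u)
        return;                                /* already probed */
      simd_support = 0;
      /* ...query CPU features and set bits here... */
      env = getenv("JSIMD_FORCENONE");         /* environment override wins */
      if (env != NULL && strcmp(env, "1") == 0)
        simd_support = 0;
    }
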
diff --git a/media/libjpeg/simd/jsimd_x86_64.c b/media/libjpeg/simd/jsimd_x86_64.c
deleted file mode 100644
index a62bcdb0ee..0000000000
--- a/media/libjpeg/simd/jsimd_x86_64.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/*
- * jsimd_x86_64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = JSIMD_SSE2 | JSIMD_SSE;
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_ycc_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_ycc_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_ycc_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_ycc_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_ycc_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_ycc_convert_sse2;
- break;
- default:
- sse2fct=jsimd_rgb_ycc_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_gray_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_gray_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_gray_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_gray_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_gray_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_gray_convert_sse2;
- break;
- default:
- sse2fct=jsimd_rgb_gray_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_ycc_extrgb_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_ycc_extrgbx_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_ycc_extbgr_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_ycc_extbgrx_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_ycc_extxbgr_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_ycc_extxrgb_convert_sse2;
- break;
- default:
- sse2fct=jsimd_ycc_rgb_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
- break;
- default:
- sse2fct=jsimd_h2v2_merged_upsample_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
- break;
- default:
- sse2fct=jsimd_h2v1_merged_upsample_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
- jsimd_fdct_float_sse(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
- if (sizeof(FLOAT_MULT_TYPE) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && simd_huffman &&
- IS_ALIGNED_SSE(jconst_huff_encode_one_block))
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
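IS_ALIGNED near the top of the deleted x86-64 file is the runtime guard for PIC builds, where constants are not guaranteed to keep their link-time alignment. The mask arithmetic is simple once spelled out: order 4 yields (1 << 4) - 1 = 0xF, and a pointer is 16-byte aligned exactly when its low four bits are clear. A small worked example:

    #include <assert.h>
    #include <stddef.h>

    #define IS_ALIGNED(ptr, order)  (((size_t)(ptr) & ((1 << (order)) - 1)) == 0)

    int main(void)
    {
      assert(IS_ALIGNED((void *)0x7f0010, 4));   /* low nibble 0x0: aligned */
      assert(!IS_ALIGNED((void *)0x7f0018, 4));  /* low nibble 0x8: not aligned */
      return 0;
    }
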
diff --git a/media/libjpeg/simd/jsimdcpu.asm b/media/libjpeg/simd/jsimdcpu.asm
deleted file mode 100644
index 599083b182..0000000000
--- a/media/libjpeg/simd/jsimdcpu.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jsimdcpu.asm - SIMD instruction support check
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Check if the CPU supports SIMD instructions
-;
-; GLOBAL(unsigned int)
-; jpeg_simd_cpu_support (void)
-;
-
- align 16
- global EXTN(jpeg_simd_cpu_support)
-
-EXTN(jpeg_simd_cpu_support):
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
- push edi
-
- xor edi,edi ; simd support flag
-
- pushfd
- pop eax
- mov edx,eax
- xor eax, 1<<21 ; flip ID bit in EFLAGS
- push eax
- popfd
- pushfd
- pop eax
- xor eax,edx
- jz short .return ; CPUID is not supported
-
- ; Check for MMX instruction support
- xor eax,eax
- cpuid
- test eax,eax
- jz short .return
-
- xor eax,eax
- inc eax
- cpuid
- mov eax,edx ; eax = Standard feature flags
-
- test eax, 1<<23 ; bit23:MMX
- jz short .no_mmx
- or edi, byte JSIMD_MMX
-.no_mmx:
- test eax, 1<<25 ; bit25:SSE
- jz short .no_sse
- or edi, byte JSIMD_SSE
-.no_sse:
- test eax, 1<<26 ; bit26:SSE2
- jz short .no_sse2
- or edi, byte JSIMD_SSE2
-.no_sse2:
-
- ; Check for 3DNow! instruction support
- mov eax, 0x80000000
- cpuid
- cmp eax, 0x80000000
- jbe short .return
-
- mov eax, 0x80000001
- cpuid
- mov eax,edx ; eax = Extended feature flags
-
- test eax, 1<<31 ; bit31:3DNow!(vendor independent)
- jz short .no_3dnow
- or edi, byte JSIMD_3DNOW
-.no_3dnow:
-
-.return:
- mov eax,edi
-
- pop edi
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
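jpeg_simd_cpu_support first proves that CPUID exists at all by flipping bit 21 (the ID bit) of EFLAGS and checking that the flip sticks, then reads the standard feature flags from leaf 1: EDX bit 23 is MMX, bit 25 SSE, bit 26 SSE2, and extended leaf 0x80000001 reports 3DNow! in EDX bit 31. The same probe expressed in C with the GCC/Clang helper; the SK_* flag values are illustrative stand-ins for the JSIMD_* constants defined in jsimd.h:

    #include <cpuid.h>

    #define SK_MMX   0x01   /* illustrative flag values */
    #define SK_SSE   0x02
    #define SK_SSE2  0x04

    unsigned int cpu_support_sketch(void)
    {
      unsigned int a, b, c, d, flags = 0;

      if (!__get_cpuid(1, &a, &b, &c, &d))
        return 0;                    /* CPUID or leaf 1 unavailable */
      if (d & (1u << 23)) flags |= SK_MMX;
      if (d & (1u << 25)) flags |= SK_SSE;
      if (d & (1u << 26)) flags |= SK_SSE2;
      return flags;
    }
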
diff --git a/media/libjpeg/simd/jsimdext.inc b/media/libjpeg/simd/jsimdext.inc
deleted file mode 100644
index f28db60b57..0000000000
--- a/media/libjpeg/simd/jsimdext.inc
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; jsimdext.inc - common declarations
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
-;
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-;
-; This software is provided 'as-is', without any express or implied
-; warranty. In no event will the authors be held liable for any damages
-; arising from the use of this software.
-;
-; Permission is granted to anyone to use this software for any purpose,
-; including commercial applications, and to alter it and redistribute it
-; freely, subject to the following restrictions:
-;
-; 1. The origin of this software must not be misrepresented; you must not
-; claim that you wrote the original software. If you use this software
-; in a product, an acknowledgment in the product documentation would be
-; appreciated but is not required.
-; 2. Altered source versions must be plainly marked as such, and must not be
-; misrepresented as being the original software.
-; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
-
-; ==========================================================================
-; System-dependent configurations
-
-%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
-; * Microsoft Visual C++
-; * MinGW (Minimalist GNU for Windows)
-; * CygWin
-; * LCC-Win32
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
-%else
-%define SEG_TEXT .text align=16 public use32 class=CODE
-%define SEG_CONST .rdata align=16 public use32 class=CONST
-%endif
-
-%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
-; * Microsoft Visual C++
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
-%else
-%define SEG_TEXT .text align=16 public use64 class=CODE
-%define SEG_CONST .rdata align=16 public use64 class=CONST
-%endif
-%define EXTN(name) name ; foo() -> foo
-
-%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
-; * Borland C++ (Win32)
-
-; -- segment definition --
-;
-%define SEG_TEXT _text align=16 public use32 class=CODE
-%define SEG_CONST _data align=16 public use32 class=DATA
-
-%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
-; * Linux
-; * *BSD family Unix using elf format
-; * Unix System V, including Solaris x86, UnixWare and SCO Unix
-
-; mark stack as non-executable
-section .note.GNU-stack noalloc noexec nowrite progbits
-
-; -- segment definition --
-;
-%ifdef __x86_64__
-%define SEG_TEXT .text progbits align=16
-%define SEG_CONST .rodata progbits align=16
-%else
-%define SEG_TEXT .text progbits alloc exec nowrite align=16
-%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
-%endif
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
-%define EXTN(name) name ; foo() -> foo
-
-%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
-; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
-; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
-
-; -- segment definition --
-;
-%define SEG_TEXT .text
-%define SEG_CONST .data
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
-
-%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
-; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
-
-; -- segment definition --
-;
-%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
-%define SEG_CONST .rodata align=16
-
-; The generation of position-independent code (PIC) is the default on Darwin.
-;
-%define PIC
-%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
-
-%else ; ----(Other case)----------------------
-
-; -- segment definition --
-;
-%define SEG_TEXT .text
-%define SEG_CONST .data
-
-%endif ; ----------------------------------------------
-
-; ==========================================================================
-
-; --------------------------------------------------------------------------
-; Common types
-;
-%ifdef __x86_64__
-%define POINTER qword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
-%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
-%else
-%define POINTER dword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
-%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
-%endif
-
-%define INT dword ; signed integer type
-%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
-%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
-
-%define FP32 dword ; IEEE754 single
-%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
-%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
-
-%define MMWORD qword ; int64 (MMX register)
-%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
-%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
-
-; NASM is buggy and doesn't properly handle operand sizes for SSE
-; instructions, so for now we have to define XMMWORD as blank.
-%define XMMWORD ; int128 (SSE register)
-%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
-%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
-
-; Similar hacks for when we load a dword or MMWORD into an xmm# register
-%define XMM_DWORD
-%define XMM_MMWORD
-
-%define SIZEOF_BYTE 1 ; sizeof(BYTE)
-%define SIZEOF_WORD 2 ; sizeof(WORD)
-%define SIZEOF_DWORD 4 ; sizeof(DWORD)
-%define SIZEOF_QWORD 8 ; sizeof(QWORD)
-%define SIZEOF_OWORD 16 ; sizeof(OWORD)
-
-%define BYTE_BIT 8 ; CHAR_BIT in C
-%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
-
-; --------------------------------------------------------------------------
-; External Symbol Name
-;
-%ifndef EXTN
-%define EXTN(name) _ %+ name ; foo() -> _foo
-%endif
-
-; --------------------------------------------------------------------------
-; Macros for position-independent code (PIC) support
-;
-%ifndef GOT_SYMBOL
-%undef PIC
-%endif
-
-%ifdef PIC ; -------------------------------------------
-
-%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
-
-; At present, nasm doesn't seem to support PIC generation for Mach-O.
-; The PIC support code below is a little tricky.
-
- SECTION SEG_CONST
-const_base:
-
-%define GOTOFF(got,sym) (got) + (sym) - const_base
-
-%imacro get_GOT 1
- ; NOTE: this macro destroys the ecx register.
- call %%geteip
- add ecx, byte (%%ref - $)
- jmp short %%adjust
-%%geteip:
- mov ecx, POINTER [esp]
- ret
-%%adjust:
- push ebp
- xor ebp,ebp ; ebp = 0
-%ifidni %1,ebx ; (%1 == ebx)
- ; db 0x8D,0x9C + jmp near const_base =
- ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
- db 0x8D,0x9C ; 8D,9C
- jmp near const_base ; E9,(const_base-%%ref)
-%%ref:
-%else ; (%1 != ebx)
- ; db 0x8D,0x8C + jmp near const_base =
- ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
- db 0x8D,0x8C ; 8D,8C
- jmp near const_base ; E9,(const_base-%%ref)
-%%ref: mov %1, ecx
-%endif ; (%1 == ebx)
- pop ebp
-%endmacro
-
-%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
-
-%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
-
-%imacro get_GOT 1
- extern GOT_SYMBOL
- call %%geteip
- add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
- jmp short %%done
-%%geteip:
- mov %1, POINTER [esp]
- ret
-%%done:
-%endmacro
-
-%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
-
-%imacro pushpic 1.nolist
- push %1
-%endmacro
-%imacro poppic 1.nolist
- pop %1
-%endmacro
-%imacro movpic 2.nolist
- mov %1,%2
-%endmacro
-
-%else ; !PIC -----------------------------------------
-
-%define GOTOFF(got,sym) (sym)
-
-%imacro get_GOT 1.nolist
-%endmacro
-%imacro pushpic 1.nolist
-%endmacro
-%imacro poppic 1.nolist
-%endmacro
-%imacro movpic 2.nolist
-%endmacro
-
-%endif ; PIC -----------------------------------------
-
-; --------------------------------------------------------------------------
-; Align the next instruction on {2,4,8,16,..}-byte boundary.
-; ".balign n,,m" in GNU as
-;
-%define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
-%define FILLB(b,n) (($$-(b)) & ((n)-1))
-
-%imacro alignx 1-2.nolist 0xFFFF
-%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
- db 0x90 ; nop
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
- db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
- db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
- db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
- db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
- db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
- db 0x8B,0xED ; mov ebp,ebp
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
- db 0x90 ; nop
-%endmacro
-
-; Align the next data on {2,4,8,16,..}-byte boundary.
-;
-%imacro alignz 1.nolist
- align %1, db 0 ; filling zeros
-%endmacro
-
-%ifdef __x86_64__
-
-%ifdef WIN64
-
-%imacro collect_args 0
- push r12
- push r13
- push r14
- push r15
- mov r10, rcx
- mov r11, rdx
- mov r12, r8
- mov r13, r9
- mov r14, [rax+48]
- mov r15, [rax+56]
- push rsi
- push rdi
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm6
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm7
-%endmacro
-
-%imacro uncollect_args 0
- movaps xmm7, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- movaps xmm6, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- pop rdi
- pop rsi
- pop r15
- pop r14
- pop r13
- pop r12
-%endmacro
-
-%else
-
-%imacro collect_args 0
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- mov r10, rdi
- mov r11, rsi
- mov r12, rdx
- mov r13, rcx
- mov r14, r8
- mov r15, r9
-%endmacro
-
-%imacro uncollect_args 0
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
-%endmacro
-
-%endif
-
-%endif
-
-; --------------------------------------------------------------------------
-; Defines picked up from the C headers
-;
-%include "jsimdcfg.inc"
-
-; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c
new file mode 100644
index 0000000000..d2546eed32
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd.c
@@ -0,0 +1,1147 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+ const char *file_name = "/proc/cpuinfo";
+ char cpuinfo_line[256];
+ FILE *f = NULL;
+
+ simd_support = 0;
+
+ if ((f = fopen(file_name, "r")) != NULL) {
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ simd_support |= JSIMD_DSPR2;
+ return;
+ }
+ }
+ fclose(f);
+ }
+ /* Did not find string in the proc file, or not Linux ELF. */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We can still use MIPS DSPr2 even if -mdspr2 was not passed to gcc
+   * globally, by performing runtime detection via /proc/cpuinfo parsing
+   * on Linux. */
+ parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEDSPR2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_DSPR2;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
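+
+#if 0
+/* A minimal sketch (kept out of the build) of how the environment
+ * overrides above could be exercised from a test harness.  It assumes a
+ * POSIX setenv() from <stdlib.h>; force_simd_for_testing() is a
+ * hypothetical helper, not part of the library API. */
+static void force_simd_for_testing(int want_dspr2)
+{
+  if (want_dspr2)
+    setenv("JSIMD_FORCEDSPR2", "1", 1);  /* claim DSPr2 support */
+  else
+    setenv("JSIMD_FORCENONE", "1", 1);   /* fall back to the C code paths */
+  simd_support = ~0U;                    /* make init_simd() re-detect */
+}
+#endif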
+
+static const int mips_idct_ifast_coefs[4] = {
+ 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */
+ 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */
+ 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */
+ 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
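+
+/* Each entry above packs one Q15 constant into both halfwords of a 32-bit
+ * word so it can feed paired 16-bit DSPr2 multiplies.  Worked example:
+ * FIX(1.414213562 / 2) = round(0.707106781 * 32768) = 23170 = 0x5A82,
+ * stored (with the low bits truncated, as the comments note) as
+ * 0x5A805A80. */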
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+typedef struct {
+ struct jpeg_upsampler pub;
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+ upsample1_ptr methods[MAX_COMPONENTS];
+ int next_row_out;
+ JDIMENSION rows_to_go;
+ int rowgroup_height[MAX_COMPONENTS];
+ UINT8 h_expand[MAX_COMPONENTS];
+ UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
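+
+/* (The default arm of the switch above also covers classic JCS_RGB input,
+ * which, with the default RGB_RED/RGB_GREEN/RGB_BLUE offsets, is
+ * byte-identical to JCS_EXT_RGB.) */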
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_extbgr_gray_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+ output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+ * regression tests, probably because the DSPr2 SIMD implementation predates
+ * those tests. */
+#if 0
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+ * regression tests, probably because the DSPr2 SIMD implementation predates
+ * those tests. */
+#if 0
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+ compptr->v_samp_factor,
+ cinfo->max_v_samp_factor,
+ cinfo->smoothing_factor,
+ compptr->width_in_blocks,
+ cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+ jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+ upsample->v_expand[compptr->component_index],
+ input_data, output_data_ptr, cinfo->output_width,
+ cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+ cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+ cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#ifndef __mips_soft_float
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+ jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#ifndef __mips_soft_float
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+ jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ init_simd();
+
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+ jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+ workspace);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int workspace[96];
+ int output[12] = {
+ (int)(output_buf[0] + output_col),
+ (int)(output_buf[1] + output_col),
+ (int)(output_buf[2] + output_col),
+ (int)(output_buf[3] + output_col),
+ (int)(output_buf[4] + output_col),
+ (int)(output_buf[5] + output_col),
+ (int)(output_buf[6] + output_col),
+ (int)(output_buf[7] + output_col),
+ (int)(output_buf[8] + output_col),
+ (int)(output_buf[9] + output_col),
+ (int)(output_buf[10] + output_col),
+ (int)(output_buf[11] + output_col)
+ };
+
+ jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+ jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
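+
+/* The pass1/pass2 DSPr2 IDCTs above take a flat array of row addresses
+ * rather than the usual (output_buf, output_col) pair; casting JSAMPROW
+ * pointers to int is safe here only because sizeof(int) ==
+ * sizeof(JSAMPROW) on the 32-bit MIPS targets this code supports. */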
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int output[8] = {
+ (int)(output_buf[0] + output_col),
+ (int)(output_buf[1] + output_col),
+ (int)(output_buf[2] + output_col),
+ (int)(output_buf[3] + output_col),
+ (int)(output_buf[4] + output_col),
+ (int)(output_buf[5] + output_col),
+ (int)(output_buf[6] + output_col),
+ (int)(output_buf[7] + output_col)
+ };
+
+ jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+ IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JCOEFPTR inptr;
+ IFAST_MULT_TYPE *quantptr;
+ DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+ jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
+ mips_idct_ifast_coefs);
+
+ /* Pass 2: process rows from work array, store into output array. */
+ /* Note that we must descale the results by a factor of 8 == 2**3, */
+ /* and also undo the PASS1_BITS scaling. */
+
+ jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+ mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
+ * Darko Laus <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ */
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ lw t9, 24(sp) /* t9 = num_rows */
+ lw s0, 28(sp) /* s0 = cinfo->num_components */
+ andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */
+ beqz t0, 4f /* no residual */
+ nop
+0:
+ addiu t9, t9, -1
+ bltz t9, 7f
+ li t1, 0
+1:
+ sll t3, t1, 2
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
+ sll t4, a3, 2
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
+ addu t2, t2, t1
+ addu s1, t5, a0
+ addu t6, t5, t0
+2:
+ lbu t3, 0(t2)
+ addiu t5, t5, 1
+ sb t3, -1(t5)
+ bne t6, t5, 2b
+ addu t2, t2, s0
+3:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 3b
+ sb t8, -1(t5)
+ addiu t1, t1, 1
+ bne t1, s0, 1b
+ nop
+ addiu a1, a1, 4
+ bgez t9, 0b
+ addiu a3, a3, 1
+ b 7f
+ nop
+4:
+ addiu t9, t9, -1
+ bltz t9, 7f
+ li t1, 0
+5:
+ sll t3, t1, 2
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
+ sll t4, a3, 2
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
+ addu t2, t2, t1
+ addu s1, t5, a0
+ addu t6, t5, t0
+6:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 6b
+ sb t8, -1(t5)
+ addiu t1, t1, 1
+ bne t1, s0, 5b
+ nop
+ addiu a1, a1, 4
+ bgez t9, 4b
+ addiu a3, a3, 1
+7:
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw t7, 48(sp) /* t7 = num_rows */
+ li s0, 0x4c8b /* FIX(0.29900) */
+ li s1, 0x9646 /* FIX(0.58700) */
+ li s2, 0x1d2f /* FIX(0.11400) */
+ li s3, 0xffffd4cd /* -FIX(0.16874) */
+ li s4, 0xffffab33 /* -FIX(0.33126) */
+ li s5, 0x8000 /* FIX(0.50000) */
+ li s6, 0xffff94d1 /* -FIX(0.41869) */
+ li s7, 0xffffeb2f /* -FIX(0.08131) */
+ li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+ addiu t7, -1 /* --num_rows */
+ lw t6, 0(a1) /* t6 = input_buf[0] */
+ lw t0, 0(a2)
+ lw t1, 4(a2)
+ lw t2, 8(a2)
+ sll t3, a3, 2
+ lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */
+ lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */
+ lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */
+
+ addu t9, t2, a0 /* t9 = end address */
+ addiu a3, 1
+
+1:
+ DO_RGB_TO_YCC t3, t4, t5, t6
+
+ mtlo s5, $ac0
+ mtlo t8, $ac1
+ mtlo t8, $ac2
+ maddu $ac0, s2, t5
+ maddu $ac1, s5, t5
+ maddu $ac2, s5, t3
+ maddu $ac0, s0, t3
+ maddu $ac1, s3, t3
+ maddu $ac2, s6, t4
+ maddu $ac0, s1, t4
+ maddu $ac1, s4, t4
+ maddu $ac2, s7, t5
+ extr.w t3, $ac0, 16
+ extr.w t4, $ac1, 16
+ extr.w t5, $ac2, 16
+ sb t3, 0(t0)
+ sb t4, 0(t1)
+ sb t5, 0(t2)
+ addiu t0, 1
+ addiu t2, 1
+ bne t2, t9, 1b
+ addiu t1, 1
+ bgtz t7, 0b
+ addiu a1, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
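+
+/* Worked example for the Q16 constants loaded by the macro above
+ * (SCALEBITS = 16, so 0x4c8b = round(0.29900 * 65536), etc.):
+ *
+ *   Y = (0x8000 + 0x4c8b*R + 0x9646*G + 0x1d2f*B) >> 16
+ *
+ * which is exactly what the mtlo/maddu/extr.w 16 sequence on $ac0
+ * computes per pixel; $ac1/$ac2 do the same for Cb and Cr, with the
+ * 0x807fff rounding constant folding in the +128 chroma offset. */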
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r_offs(\outptr)
+ sb \scratch1, \g_offs(\outptr)
+ sb \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+ li t0, 0xFF
+ sb t0, \a_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = input_row
+ * a3 = output_buf
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s1, 48(sp)
+ li t3, 0x8000
+ li t4, 0x166e9 /* FIX(1.40200) */
+ li t5, 0x1c5a2 /* FIX(1.77200) */
+ li t6, 0xffff492e /* -FIX(0.71414) */
+ li t7, 0xffffa7e6 /* -FIX(0.34414) */
+ repl.ph t8, 128
+
+0:
+ lw s0, 0(a3)
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ sll s5, a2, 2
+ addiu s1, -1
+ lwx s2, s5(t0)
+ lwx s3, s5(t1)
+ lwx s4, s5(t2)
+ addu t9, s2, a0
+ addiu a2, 1
+
+1:
+ lbu s7, 0(s4) /* cr */
+ lbu s6, 0(s3) /* cb */
+ lbu s5, 0(s2) /* y */
+ addiu s2, 1
+ addiu s4, 1
+ addiu s7, -128
+ addiu s6, -128
+ mul t2, t7, s6
+ mul t0, t6, s7 /* Crgtab[cr] */
+ sll s7, 15
+ mulq_rs.w t1, t4, s7 /* Crrtab[cr] */
+ sll s6, 15
+ addu t2, t3 /* Cbgtab[cb] */
+ addu t2, t0
+
+ mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */
+ sra t2, 16
+ addu t1, s5
+ addu t2, s5 /* add y */
+ ins t2, t1, 16, 16
+ subu.ph t2, t2, t8
+ addu t0, s5
+ shll_s.ph t2, t2, 8
+ subu t0, 128
+ shra.ph t2, t2, 8
+ shll_s.w t0, t0, 24
+ addu.ph t2, t2, t8 /* clip & store */
+ sra t0, t0, 24
+ sra t1, t2, 16
+ addiu t0, 128
+
+ STORE_YCC_TO_RGB t1, t2, t0, s0
+
+ bne s2, t9, 1b
+ addiu s3, 1
+ bgtz s1, 0b
+ addiu a3, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R G B A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
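+
+/* Worked example for the constants above: 0x166e9 = round(1.40200 * 65536)
+ * = FIX(1.40200).  The chroma sample is pre-shifted left by 15, so
+ *
+ *   mulq_rs.w t1, t4, s7  ~=  (0x166e9 * (cr << 15)) >> 31  =  1.402 * cr
+ *
+ * (up to the instruction's rounding/saturation), i.e. the Crrtab[cr] value
+ * from the scalar jdcolor.c code, computed on the fly instead of via a
+ * lookup table. */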
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ li s0, 0x4c8b /* s0 = FIX(0.29900) */
+ li s1, 0x9646 /* s1 = FIX(0.58700) */
+ li s2, 0x1d2f /* s2 = FIX(0.11400) */
+ li s7, 0x8000 /* s7 = FIX(0.50000) */
+ lw s6, 48(sp)
+ andi t7, a0, 3
+
+0:
+ addiu s6, -1 /* s6 = num_rows */
+ lw t0, 0(a1)
+ lw t1, 0(a2)
+ sll t3, a3, 2
+ lwx t1, t3(t1)
+ addiu a3, 1
+ addu t9, t1, a0
+ subu t8, t9, t7
+ beq t1, t8, 2f
+ nop
+
+1:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t6, $ac0, 16
+
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ extr.w t2, $ac1, 16
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t5, $ac0, 16
+ sb t6, 0(t1)
+ sb t2, 1(t1)
+ extr.w t3, $ac1, 16
+ addiu t1, 4
+ sb t5, -2(t1)
+ sb t3, -1(t1)
+ bne t1, t8, 1b
+ nop
+
+2:
+ beqz t7, 4f
+ nop
+
+3:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ extr.w t6, $ac0, 16
+ sb t6, 0(t1)
+ addiu t1, 1
+ bne t1, t9, 3b
+ nop
+
+4:
+ bgtz s6, 0b
+ addiu a1, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
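+
+/* The gray path reuses the Y constants from the RGB -> YCC transform and
+ * unrolls four pixels per iteration, interleaving the $ac0 and $ac1
+ * accumulators (presumably to hide the multiply-accumulate latency). */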
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li \scratch0, 0xFF
+ sb \scratch0, \a1_offs(\outptr)
+ sb \scratch0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ lw t9, 56(sp) /* cinfo->sample_range_limit */
+ lw v0, 0(a1)
+ lw v1, 4(a1)
+ lw t0, 8(a1)
+ sll t1, a2, 3
+ addiu t2, t1, 4
+ sll t3, a2, 2
+ lw t4, 0(a3) /* t4 = output_buf[0] */
+ lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */
+ lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+ lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */
+ lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */
+ lw t7, 4(a3) /* t7 = output_buf[1] */
+ li s1, 0xe6ea
+ addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */
+ addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s1, zero, 0xa7e6 /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s2, s1, 0xeec8 /* s2 = 0xffff492e [-FIX(0.71414)] */
+ srl t3, a0, 1
+ blez t3, 2f
+ addu t0, t5, t3 /* t0 = end address */
+1:
+ lbu t3, 0(t5)
+ lbu s3, 0(t6)
+ addiu t5, t5, 1
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ lbu v0, 0(t1)
+ addiu t6, t6, 1
+ addiu t1, t1, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, -1(t1)
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, 1(t2)
+ addiu t2, t2, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+ bne t0, t5, 1b
+ nop
+2:
+ andi t0, a0, 1
+ beqz t0, 4f
+ lbu t3, 0(t5)
+ lbu s3, 0(t6)
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ lbu v0, 0(t1)
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
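+
+/* The constant setup in the macro above builds 32-bit values out of 16-bit
+ * immediates: s1 = 0xe6ea, then t8 = 0xe6ea + 0x7fff = 0x166e9
+ * [FIX(1.40200)] and s0 = 0x166e9 + 0x5eb9 = 0x1c5a2 [FIX(1.77200)],
+ * one addiu per constant instead of a lui/ori pair. */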
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+ sb t0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = range_limit
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ li t0, 0xe6ea
+ lw t1, 0(a1) /* t1 = input_buf[0] */
+ lw t2, 4(a1) /* t2 = input_buf[1] */
+ lw t3, 8(a1) /* t3 = input_buf[2] */
+ lw t8, 56(sp) /* t8 = range_limit */
+ addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */
+ addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s0, t0, 0x9916 /* s0 = 0x8000 */
+ addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
+ srl t0, a0, 1
+ sll t4, a2, 2
+ lwx s5, t4(t1) /* s5 = inptr0 */
+ lwx s6, t4(t2) /* s6 = inptr1 */
+ lwx s7, t4(t3) /* s7 = inptr2 */
+ lw t7, 0(a3) /* t7 = outptr */
+ blez t0, 2f
+ addu t9, s6, t0 /* t9 = end address */
+1:
+ lbu t2, 0(s6) /* t2 = cb */
+ lbu t0, 0(s7) /* t0 = cr */
+ lbu t1, 0(s5) /* t1 = y */
+ addiu t2, t2, -128 /* t2 = cb - 128 */
+ addiu t0, t0, -128 /* t0 = cr - 128 */
+ mult $ac1, s4, t2
+ madd $ac1, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w t5, $ac1, 16
+ mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+ addiu s7, s7, 1
+ addiu s6, s6, 1
+ addu t2, t1, t0 /* t2 = y + cred */
+ addu t3, t1, t5 /* t3 = y + cgreen */
+ addu t4, t1, t6 /* t4 = y + cblue */
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t1, 1(s5)
+ lbu v0, 0(t2)
+ lbu v1, 0(t3)
+ lbu ra, 0(t4)
+ addu t2, t1, t0
+ addu t3, t1, t5
+ addu t4, t1, t6
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+ bne t9, s6, 1b
+ addiu s5, s5, 2
+2:
+ andi t0, a0, 1
+ beqz t0, 4f
+ nop
+3:
+ lbu t2, 0(s6)
+ lbu t0, 0(s7)
+ lbu t1, 0(s5)
+ addiu t2, t2, -128 /* (cb - 128) */
+ addiu t0, t0, -128 /* (cr - 128) */
+ mul t3, s4, t2
+ mul t4, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */
+ addu t3, t3, s0
+ addu t3, t4, t3
+ sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+ addu t2, t1, t0 /* y + cred */
+ addu t3, t1, t5 /* y + cgreen */
+ addu t4, t1, t6 /* y + cblue */
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ li s4, 0
+ lw s2, 0(a3) /* s2 = *output_data_ptr */
+0:
+ li t9, 2
+ lw s1, -4(a2) /* s1 = inptr1 */
+
+1:
+ lw s0, 0(a2) /* s0 = inptr0 */
+ lwx s3, s4(s2)
+ addiu s5, a1, -2 /* s5 = downsampled_width - 2 */
+ srl t4, s5, 1
+ sll t4, t4, 1
+ lbu t0, 0(s0)
+ lbu t1, 1(s0)
+ lbu t2, 0(s1)
+ lbu t3, 1(s1)
+ addiu s0, 2
+ addiu s1, 2
+ addu t8, s0, t4 /* t8 = end address */
+ andi s5, s5, 1 /* s5 = residual */
+ sll t4, t0, 1
+ sll t6, t1, 1
+ addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */
+ addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */
+ addu t7, t0, t2 /* t7 = thiscolsum */
+ addu t6, t1, t3 /* t6 = nextcolsum */
+ sll t0, t7, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t7 /* t1 = thiscolsum * 3 */
+ shra_r.w t0, t0, 4
+ addiu t1, 7
+ addu t1, t1, t6
+ srl t1, t1, 4
+ sb t0, 0(s3)
+ sb t1, 1(s3)
+ beq t8, s0, 22f /* skip to final iteration if width == 3 */
+ addiu s3, 2
+2:
+ lh t0, 0(s0) /* t0 = A3|A2 */
+ lh t2, 0(s1) /* t2 = B3|B2 */
+ addiu s0, 2
+ addiu s1, 2
+ preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */
+ preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */
+ shll.ph t1, t0, 1
+ sll t3, t6, 1
+ addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */
+ addu t3, t3, t6 /* t3 = this * 3 */
+ addu.ph t0, t0, t2 /* t0 = next2|next1 */
+ addu t1, t3, t7
+ andi t7, t0, 0xFFFF /* t7 = next1 */
+ sll t2, t7, 1
+ addu t2, t7, t2 /* t2 = next1*3 */
+ addu t4, t2, t6
+ srl t6, t0, 16 /* t6 = next2 */
+ shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */
+ addu t0, t3, t7
+ addiu t0, 7
+ srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */
+ shra_r.w t4, t4, 4 /* t4 = (next1*3 + this + 8) >> 4 */
+ addu t2, t2, t6
+ addiu t2, 7
+ srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ sb t4, 2(s3)
+ sb t2, 3(s3)
+ bne t8, s0, 2b
+ addiu s3, 4
+22:
+ beqz s5, 4f
+ addu t8, s0, s5
+3:
+ lbu t0, 0(s0)
+ lbu t2, 0(s1)
+ addiu s0, 1
+ addiu s1, 1
+ sll t3, t6, 1
+ sll t1, t0, 1
+ addu t1, t0, t1 /* t1 = inptr0 * 3 */
+ addu t3, t3, t6 /* t3 = thiscolsum * 3 */
+ addu t5, t1, t2
+ addu t1, t3, t7
+ shra_r.w t1, t1, 4
+ addu t0, t3, t5
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu s3, 2
+ move t7, t6
+ bne t8, s0, 3b
+ move t6, t5
+4:
+ sll t0, t6, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t6 /* t1 = thiscolsum * 3 */
+ addu t1, t1, t7
+ addiu s4, 4
+ shra_r.w t1, t1, 4
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu t9, -1
+ addiu s3, 2
+ bnez t9, 1b
+ lw s1, 4(a2)
+ srl t0, s4, 2
+ subu t0, a0, t0
+ bgtz t0, 0b
+ addiu a2, 4
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
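+
+/* Reference arithmetic for the kernel above (cf. h2v2_fancy_upsample in
+ * jdsample.c): with thiscolsum = 3*inptr0[i] + inptr1[i] per column,
+ *
+ *   out_even = (thiscolsum*3 + lastcolsum + 8) >> 4
+ *   out_odd  = (thiscolsum*3 + nextcolsum + 7) >> 4
+ *
+ * matching the shra_r.w (round by 8) and "addiu 7; srl 4" sequences. */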
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ .set at
+
+ beqz a0, 3f
+ sll t0, a0, 2
+ lw s1, 0(a3)
+ li s3, 0x10001
+ addu s0, s1, t0
+0:
+ addiu t8, a1, -2
+ srl t9, t8, 2
+ lw t7, 0(a2)
+ lw s2, 0(s1)
+ lbu t0, 0(t7)
+ lbu t1, 1(t7) /* t1 = inptr[1] */
+ sll t2, t0, 1
+ addu t2, t2, t0 /* t2 = invalue*3 */
+ addu t2, t2, t1
+ shra_r.w t2, t2, 2
+ sb t0, 0(s2)
+ sb t2, 1(s2)
+ beqz t9, 11f
+ addiu s2, 2
+1:
+ ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 1(t7)
+ ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */
+ preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */
+ preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */
+ preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */
+ preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */
+ preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */
+ shll.ph t5, t4, 1
+ shll.ph t6, t1, 1
+ addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */
+ addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */
+ addu.ph t4, t3, s3
+ addu.ph t0, t0, s3
+ addu.ph t4, t4, t5
+ addu.ph t0, t0, t6
+ shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */
+ shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */
+ addu.ph t2, t2, t5
+ addu.ph t3, t3, t6
+ shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */
+ shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */
+ shll.ph t2, t2, 8
+ shll.ph t3, t3, 8
+ or t2, t4, t2
+ or t3, t3, t0
+ addiu t9, -1
+ usw t3, 0(s2)
+ usw t2, 4(s2)
+ addiu s2, 8
+ bgtz t9, 1b
+ addiu t7, 4
+11:
+ andi t8, 3
+ beqz t8, 22f
+ addiu t7, 1
+
+2:
+ lbu t0, 0(t7)
+ addiu t7, 1
+ sll t1, t0, 1
+ addu t2, t0, t1 /* t2 = invalue * 3 */
+ lbu t3, -2(t7)
+ lbu t4, 0(t7)
+ addiu t3, 1
+ addiu t4, 2
+ addu t3, t3, t2
+ addu t4, t4, t2
+ srl t3, 2
+ srl t4, 2
+ sb t3, 0(s2)
+ sb t4, 1(s2)
+ addiu t8, -1
+ bgtz t8, 2b
+ addiu s2, 2
+
+22:
+ lbu t0, 0(t7)
+ lbu t2, -1(t7)
+ sll t1, t0, 1
+ addu t1, t1, t0 /* t1 = invalue * 3 */
+ addu t1, t1, t2
+ addiu t1, 1
+ srl t1, t1, 2
+ sb t1, 0(s2)
+ sb t0, 1(s2)
+ addiu s1, 4
+ bne s1, s0, 0b
+ addiu a2, 4
+3:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
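+
+/* Reference arithmetic (cf. h2v1_fancy_upsample in jdsample.c): each input
+ * pixel expands to
+ *
+ *   out_left  = (invalue*3 + inptr[-1] + 1) >> 2
+ *   out_right = (invalue*3 + inptr[1]  + 2) >> 2
+ *
+ * with the first and last columns replicated at the image edges. */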
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+ beqz a2, 7f
+ lw s1, 44(sp) /* s1 = output_data */
+ lw s0, 40(sp) /* s0 = input_data */
+ srl s2, a0, 2
+ andi t9, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2
+0:
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ li s3, 0 /* s3 = bias */
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2
+ andi t8, t7, 3
+1:
+ ulhu t0, 0(t5)
+ ulhu t1, 2(t5)
+ ulhu t2, 4(t5)
+ ulhu t3, 6(t5)
+ raddu.w.qb t0, t0
+ raddu.w.qb t1, t1
+ raddu.w.qb t2, t2
+ raddu.w.qb t3, t3
+ shra.ph t0, t0, 1
+ shra_r.ph t1, t1, 1
+ shra.ph t2, t2, 1
+ shra_r.ph t3, t3, 1
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t3, 3(t4)
+ addiu s4, -1
+ addiu t4, 4
+ bgtz s4, 1b
+ addiu t5, 8
+ beqz t8, 3f
+ addu s4, t4, t8
+2:
+ ulhu t0, 0(t5)
+ raddu.w.qb t0, t0
+ addqh.w t0, t0, s3
+ xori s3, s3, 1
+ sb t0, 0(t4)
+ addiu t4, 1
+ bne t4, s4, 2b
+ addiu t5, 2
+3:
+ lbux t1, t6(t5)
+ sll t1, 1
+ addqh.w t2, t1, s3 /* t2 = pixval1 */
+ xori s3, s3, 1
+ addqh.w t3, t1, s3 /* t3 = pixval2 */
+ blez s2, 5f
+ append t3, t2, 8
+ addu t5, t4, s2 /* t5 = loop_end2 */
+4:
+ ush t3, 0(t4)
+ addiu s2, -1
+ bgtz s2, 4b
+ addiu t4, 2
+5:
+ beqz t9, 6f
+ nop
+ sb t2, 0(t4)
+6:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 4
+7:
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+ j ra
+ nop
+END(jsimd_h2v1_downsample_dspr2)
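+
+/* The kernel above implements jcsample.c's h2v1 rule
+ *
+ *   out = (in0 + in1 + bias) >> 1,  bias alternating 0,1
+ *
+ * by alternating plain (shra.ph) and rounding (shra_r.ph) shifts in the
+ * unrolled body and by xori-toggling the s3 bias in the residual loop. */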
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ beqz a2, 8f
+ lw s1, 52(sp) /* s1 = output_data */
+ lw s0, 48(sp) /* s0 = input_data */
+
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2
+ andi t8, t7, 3
+ andi t9, a0, 2
+ srl s2, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2
+0:
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ lw s7, 4(s0) /* s7 = inptr1 */
+ li s6, 1 /* s6 = bias */
+2:
+ ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */
+ ulw t2, 4(t5)
+ ulw t3, 4(s7)
+ precrq.ph.w t7, t0, t1 /* t7 = |P3|P2|Q3|Q2| */
+ ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */
+ raddu.w.qb t1, t7
+ raddu.w.qb t0, t0
+ shra_r.w t1, t1, 2
+ addiu t0, 1
+ srl t0, 2
+ precrq.ph.w t7, t2, t3
+ ins t2, t3, 16, 16
+ raddu.w.qb t7, t7
+ raddu.w.qb t2, t2
+ shra_r.w t7, t7, 2
+ addiu t2, 1
+ srl t2, 2
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t7, 3(t4)
+ addiu t4, 4
+ addiu t5, 8
+ addiu s4, s4, -1
+ bgtz s4, 2b
+ addiu s7, 8
+ beqz t8, 4f
+ addu t8, t4, t8
+3:
+ ulhu t0, 0(t5)
+ ulhu t1, 0(s7)
+ ins t0, t1, 16, 16
+ raddu.w.qb t0, t0
+ addu t0, t0, s6
+ srl t0, 2
+ xori s6, s6, 3
+ sb t0, 0(t4)
+ addiu t5, 2
+ addiu t4, 1
+ bne t8, t4, 3b
+ addiu s7, 2
+4:
+ lbux t1, t6(t5)
+ sll t1, 1
+ lbux t0, t6(s7)
+ sll t0, 1
+ addu t1, t1, t0
+ addu t3, t1, s6
+ srl t0, t3, 2 /* t0 = pixval1 */
+ xori s6, s6, 3
+ addu t2, t1, s6
+ srl t1, t2, 2 /* t1 = pixval2 */
+ blez s2, 6f
+ append t1, t0, 8
+5:
+ ush t1, 0(t4)
+ addiu s2, -1
+ bgtz s2, 5b
+ addiu t4, 2
+6:
+ beqz t9, 7f
+ nop
+ sb t0, 0(t4)
+7:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 8
+8:
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_h2v2_downsample_dspr2)
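+
+/* Likewise, the h2v2 kernel implements jcsample.c's
+ *
+ *   out = (in00 + in01 + in10 + in11 + bias) >> 2,  bias alternating 1,2
+ *
+ * pairing "addiu 1; srl 2" (bias 1) with shra_r.w (bias 2) in the unrolled
+ * body and xori-toggling s6 between 1 and 2 in the residual loop. */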
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * a0 = input_data
+ * a1 = output_data
+ * a2 = compptr->v_samp_factor
+ * a3 = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s7, 52(sp) /* compptr->width_in_blocks */
+ lw s0, 56(sp) /* cinfo->image_width */
+ lw s6, 48(sp) /* cinfo->smoothing_factor */
+ sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */
+ sll v0, s7, 1
+ subu v0, v0, s0
+ blez v0, 2f
+ move v1, zero
+ addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+ addiu t1, a0, -4
+ sll t2, v1, 2
+ lwx t1, t2(t1)
+ move t3, v0
+ addu t1, t1, s0
+ lbu t2, -1(t1)
+1:
+ addiu t3, t3, -1
+ sb t2, 0(t1)
+ bgtz t3, 1b
+ addiu t1, t1, 1
+ addiu v1, v1, 1
+ bne v1, t0, 0b
+ nop
+2:
+ li v0, 80
+ mul v0, s6, v0
+ li v1, 16384
+ move t4, zero
+ move t5, zero
+ subu t6, v1, v0 /* t6 = 16384 - smoothing_factor * 80 */
+ sll t7, s6, 4 /* t7 = smoothing_factor * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+ sll v0, t4, 2
+ lwx t8, v0(a1) /* outptr = output_data[outrow] */
+ sll v1, t5, 2
+ addiu t9, v1, 4
+ addiu s0, v1, -4
+ addiu s1, v1, 8
+ lwx s2, v1(a0) /* inptr0 = input_data[inrow] */
+ lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */
+ lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */
+ lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, 0(s2)
+ lbu v1, 2(s2)
+ lbu t0, 0(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, 0(s0)
+ lbu t0, 0(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w v0, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ addiu s1, s1, 2
+ sb v0, -1(t8)
+ addiu s4, s7, -2
+ and s4, s4, 3
+ addu s5, s4, t8 /* end address */
+4:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ sb t2, -1(t8)
+ bne s5, t8, 4b
+ addiu s1, s1, 2
+ addiu s5, s7, -2
+ subu s5, s5, s4
+ addu s5, s5, t8 /* end address */
+5:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ lh v1, 2(t9)
+ addu t0, t0, v0
+ lh v0, 2(s2)
+ addu s3, t0, s3
+ lh t0, 2(s0)
+ lh t1, 2(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 4(s2)
+ lbu t0, 1(t9)
+ lbu t1, 4(t9)
+ sb t2, 0(t8)
+ raddu.w.qb t3, v0
+ lbu v0, 1(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 4(s0)
+ addu t0, t0, v0
+ lbu v0, 1(s0)
+ addu s3, t0, s3
+ lbu t0, 1(s1)
+ lbu t3, 4(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 4(t9)
+ addu t0, t0, v0
+ lh v0, 4(s2)
+ addu s3, t0, s3
+ lh t0, 4(s0)
+ lh t1, 4(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 6(s2)
+ lbu t0, 3(t9)
+ lbu t1, 6(t9)
+ sb t2, 1(t8)
+ raddu.w.qb t3, v0
+ lbu v0, 3(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 6(s0)
+ addu t0, t0, v0
+ lbu v0, 3(s0)
+ addu s3, t0, s3
+ lbu t0, 3(s1)
+ lbu t3, 6(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 6(t9)
+ addu t0, t0, v0
+ lh v0, 6(s2)
+ addu s3, t0, s3
+ lh t0, 6(s0)
+ lh t1, 6(s1)
+ madd $ac1, s3, t7
+ extr_r.w t3, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 8(s2)
+ lbu t0, 5(t9)
+ lbu t1, 8(t9)
+ sb t3, 2(t8)
+ raddu.w.qb t2, v0
+ lbu v0, 5(s2)
+ addu t0, t0, t1
+ mult $ac1, t2, t6
+ addu v0, v0, v1
+ lbu t2, 8(s0)
+ addu t0, t0, v0
+ lbu v0, 5(s0)
+ addu s3, t0, s3
+ lbu t0, 5(s1)
+ lbu t3, 8(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ addiu t8, t8, 4
+ addu t0, t0, v0
+ addiu s2, s2, 8
+ addu s3, t0, s3
+ addiu t9, t9, 8
+ madd $ac1, s3, t7
+ extr_r.w t1, $ac1, 16
+ addiu s0, s0, 8
+ addiu s1, s1, 8
+ bne s5, t8, 5b
+ sb t1, -1(t8)
+/* Special case for last column */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 1(s2)
+ lbu t0, -1(t9)
+ lbu t1, 1(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 1(s0)
+ addu t0, t0, v0
+ lbu t3, 1(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t0, $ac1, 16
+ addiu t5, t5, 2
+ sb t0, 0(t8)
+ addiu t4, t4, 1
+ bne t4, a2, 3b
+ addiu t5, t5, 2
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * a0 = upsample->h_expand[compptr->component_index]
+ * a1 = upsample->v_expand[compptr->component_index]
+ * a2 = input_data
+ * a3 = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
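+/*
+ * Generic integer upsampling, cf. int_upsample (jdsample.c): each input
+ * pixel is replicated h_expand times horizontally (loops 1-3), then the
+ * completed row is copied to fill the remaining v_expand - 1 output
+ * rows (loops 5-8).
+ */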
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ lw s0, 0(a3) /* s0 = output_data */
+ lw s1, 32(sp) /* s1 = cinfo->output_width */
+ lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */
+ li t6, 0 /* t6 = inrow */
+ beqz s2, 10f
+ li s3, 0 /* s3 = outrow */
+0:
+ addu t0, a2, t6
+ addu t7, s0, s3
+ lw t3, 0(t0) /* t3 = inptr */
+ lw t8, 0(t7) /* t8 = outptr */
+ beqz s1, 4f
+ addu t5, t8, s1 /* t5 = outend */
+1:
+ lb t2, 0(t3) /* t2 = invalue = *inptr++ */
+ addiu t3, 1
+ beqz a0, 3f
+ move t0, a0 /* t0 = h_expand */
+2:
+ sb t2, 0(t8)
+ addiu t0, -1
+ bgtz t0, 2b
+ addiu t8, 1
+3:
+ bgt t5, t8, 1b
+ nop
+4:
+ addiu t9, a1, -1 /* t9 = v_expand - 1 */
+ blez t9, 9f
+ nop
+5:
+ lw t3, 0(s0)
+ lw t4, 4(s0)
+ subu t0, s1, 0xF
+ blez t0, 7f
+ addu t5, t3, s1 /* t5 = end address */
+ andi t7, s1, 0xF /* t7 = residual */
+ subu t8, t5, t7
+6:
+ ulw t0, 0(t3)
+ ulw t1, 4(t3)
+ ulw t2, 8(t3)
+ usw t0, 0(t4)
+ ulw t0, 12(t3)
+ usw t1, 4(t4)
+ usw t2, 8(t4)
+ usw t0, 12(t4)
+ addiu t3, 16
+ bne t3, t8, 6b
+ addiu t4, 16
+ beqz t7, 8f
+ nop
+7:
+ lbu t0, 0(t3)
+ sb t0, 0(t4)
+ addiu t3, 1
+ bne t3, t5, 7b
+ addiu t4, 1
+8:
+ addiu t9, -1
+ bgtz t9, 5b
+ addiu s0, 8
+9:
+ addu s3, s3, a1
+ bne s3, s2, 0b
+ addiu t6, 1
+10:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
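+/*
+ * 2:1 horizontal upsampling by pixel doubling, cf. h2v1_upsample
+ * (jdsample.c): the unrolled loop at 1 turns 8 input bytes into 16
+ * output bytes per iteration; loop 2 doubles any residual pixels
+ * byte by byte.
+ */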
+ lw t7, 0(a3) /* t7 = output_data */
+ andi t8, a1, 0xf /* t8 = residual */
+ sll t0, a0, 2
+ blez a0, 4f
+ addu t9, t7, t0 /* t9 = output_data end address */
+0:
+ lw t5, 0(t7) /* t5 = outptr */
+ lw t6, 0(a2) /* t6 = inptr */
+ addu t3, t5, a1 /* t3 = outptr + output_width (end address) */
+ subu t3, t8 /* t3 = end address - residual */
+ beq t5, t3, 2f
+ move t4, t8
+1:
+ ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
+ ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
+ srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
+ ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
+ ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
+ ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
+ ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t0, t2, 16 /* t0 = |X|X|P7|P6| */
+ ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
+ ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */
+ ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
+ ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t5, 16
+ bne t5, t3, 1b
+ addiu t6, 8
+ beqz t8, 3f
+ move t4, t8
+2:
+ lbu t1, 0(t6)
+ sb t1, 0(t5)
+ sb t1, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ addiu t7, 4
+ bne t9, t7, 0b
+ addiu a2, 4
+4:
+ j ra
+ nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
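+/*
+ * 2x2 upsampling by pixel replication, cf. h2v2_upsample (jdsample.c):
+ * loops 1/2 double each pixel horizontally, then loops 4/5 copy the
+ * finished row into the following output row.
+ */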
+ lw t7, 0(a3)
+ blez a0, 7f
+ andi t9, a1, 0xf /* t9 = residual */
+0:
+ lw t6, 0(a2) /* t6 = inptr */
+ lw t5, 0(t7) /* t5 = outptr */
+ addu t8, t5, a1 /* t8 = outptr end address */
+ subu t8, t9 /* t8 = end address - residual */
+ beq t5, t8, 2f
+ move t4, t9
+1:
+ ulw t0, 0(t6)
+ srl t1, t0, 16
+ ins t0, t0, 16, 16
+ ins t0, t0, 8, 16
+ ins t1, t1, 16, 16
+ ins t1, t1, 8, 16
+ ulw t2, 4(t6)
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t3, t2, 16
+ ins t2, t2, 16, 16
+ ins t2, t2, 8, 16
+ ins t3, t3, 16, 16
+ ins t3, t3, 8, 16
+ usw t2, 8(t5)
+ usw t3, 12(t5)
+ addiu t5, 16
+ bne t5, t8, 1b
+ addiu t6, 8
+ beqz t9, 3f
+ move t4, t9
+2:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ sb t0, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ lw t6, 0(t7) /* t6 = outptr[0] */
+ lw t5, 4(t7) /* t5 = outptr[1] */
+ addu t4, t6, a1 /* t4 = new end address */
+ beq a1, t9, 5f
+ subu t8, t4, t9
+4:
+ ulw t0, 0(t6)
+ ulw t1, 4(t6)
+ ulw t2, 8(t6)
+ usw t0, 0(t5)
+ ulw t0, 12(t6)
+ usw t1, 4(t5)
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t6, 16
+ bne t6, t8, 4b
+ addiu t5, 16
+ beqz t9, 6f
+ nop
+5:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ addiu t6, 1
+ bne t6, t4, 5b
+ addiu t5, 1
+6:
+ addiu t7, 8
+ addiu a0, -2
+ bgtz a0, 0b
+ addiu a2, 4
+7:
+ j ra
+ nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
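+/*
+ * Accurate integer IDCT, cf. jpeg_idct_islow (jidctint.c).  Pass 1
+ * (label 1) transforms one column per iteration into a 256-byte stack
+ * workspace, with a shortcut for all-zero AC columns; pass 2 (label 4)
+ * transforms the rows, descales, and maps each result through the
+ * range_limit table in a3.  The constants are the usual 13-bit FIX()
+ * values (e.g. 9633 = FIX_1_175875602).
+ */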
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -256
+ move v0, sp
+ addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */
+1:
+ lh s4, 32(a0) /* s4 = inptr[16] */
+ lh s5, 64(a0) /* s5 = inptr[32] */
+ lh s6, 96(a0) /* s6 = inptr[48] */
+ lh t1, 112(a0) /* t1 = inptr[56] */
+ lh t7, 16(a0) /* t7 = inptr[8] */
+ lh t5, 80(a0) /* t5 = inptr[40] */
+ lh t3, 48(a0) /* t3 = inptr[24] */
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, t5
+ or s4, s4, t7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 2f
+ addiu v1, v1, -1
+ lh s5, 0(a1) /* quantptr[DCTSIZE*0] */
+ lh s6, 0(a0) /* inptr[DCTSIZE*0] */
+ mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */
+ sll s5, s5, 2
+ sw s5, 0(v0)
+ sw s5, 32(v0)
+ sw s5, 64(v0)
+ sw s5, 96(v0)
+ sw s5, 128(v0)
+ sw s5, 160(v0)
+ sw s5, 192(v0)
+ b 3f
+ sw s5, 224(v0)
+2:
+ lh t0, 112(a1)
+ lh t2, 48(a1)
+ lh t4, 80(a1)
+ lh t6, 16(a1)
+ mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7],
+ quantptr[DCTSIZE*7]) */
+ mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3],
+ quantptr[DCTSIZE*3]) */
+ mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5],
+ quantptr[DCTSIZE*5]) */
+ mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1],
+ quantptr[DCTSIZE*1]) */
+ lh t4, 32(a1)
+ lh t5, 32(a0)
+ lh t6, 96(a1)
+ lh t7, 96(a0)
+ addu s0, t0, t1 /* z3 = tmp0 + tmp2 */
+ addu s1, t1, t2 /* z2 = tmp1 + tmp2 */
+ addu s2, t2, t3 /* z4 = tmp1 + tmp3 */
+ addu s3, s0, s2 /* z3 + z4 */
+ addiu t9, zero, 9633 /* FIX_1_175875602 */
+ mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu t8, t0, t3 /* z1 = tmp0 + tmp3 */
+ addiu t9, zero, 2446 /* FIX_0_298631336 */
+ mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t9, zero, 16819 /* FIX_2_053119869 */
+ mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t9, zero, 25172 /* FIX_3_072711026 */
+ mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t9, zero, 12299 /* FIX_1_501321110 */
+ mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ addiu t9, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t9, zero, 3196 /* FIX_0_390180644 */
+ mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t9, zero, 7373 /* FIX_0_899976223 */
+ mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t9, zero, 20995 /* FIX_2_562915447 */
+ mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ subu s0, s3, s0 /* z3 += z5 */
+ addu t0, t0, s0 /* tmp0 += z3 */
+ addu t1, t1, s0 /* tmp2 += z3 */
+ subu s2, s3, s2 /* z4 += z5 */
+ addu t2, t2, s2 /* tmp1 += z4 */
+ addu t3, t3, s2 /* tmp3 += z4 */
+ subu t0, t0, t8 /* tmp0 += z1 */
+ subu t1, t1, s1 /* tmp2 += z2 */
+ subu t2, t2, s1 /* tmp1 += z2 */
+ subu t3, t3, t8 /* tmp3 += z1 */
+ mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ addiu t9, zero, 6270 /* FIX_0_765366865 */
+ mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ lh t4, 0(a1)
+ lh t5, 0(a0)
+ lh t6, 64(a1)
+ lh t7, 64(a0)
+ mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */
+ mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ addiu t9, zero, 4433 /* FIX_0_541196100 */
+ addu s3, s0, s1 /* z2 + z3 */
+ mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t9, zero, 15137 /* FIX_1_847759065 */
+ mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */
+ addu t4, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */
+ sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */
+ addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t6, s3, t8 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ addu s0, t4, t7
+ subu s1, t4, t7
+ addu s2, t5, t6
+ subu s3, t5, t6
+ addu t4, s0, t3
+ subu s0, s0, t3
+ addu t3, s2, t1
+ subu s2, s2, t1
+ addu t1, s3, t2
+ subu s3, s3, t2
+ addu t2, s1, t0
+ subu s1, s1, t0
+ shra_r.w t4, t4, 11
+ shra_r.w t3, t3, 11
+ shra_r.w t1, t1, 11
+ shra_r.w t2, t2, 11
+ shra_r.w s1, s1, 11
+ shra_r.w s3, s3, 11
+ shra_r.w s2, s2, 11
+ shra_r.w s0, s0, 11
+ sw t4, 0(v0)
+ sw t3, 32(v0)
+ sw t1, 64(v0)
+ sw t2, 96(v0)
+ sw s1, 128(v0)
+ sw s3, 160(v0)
+ sw s2, 192(v0)
+ sw s0, 224(v0)
+3:
+ addiu a1, a1, 2
+ addiu a0, a0, 2
+ bgtz v1, 1b
+ addiu v0, v0, 4
+ move v0, sp
+ addiu v1, zero, 8
+4:
+ lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */
+ lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */
+ lw t2, 0(v0) /* (JLONG)wsptr[0] */
+ lw t3, 16(v0) /* (JLONG)wsptr[4] */
+ lw s4, 4(v0) /* (JLONG)wsptr[1] */
+ lw s5, 12(v0) /* (JLONG)wsptr[3] */
+ lw s6, 20(v0) /* (JLONG)wsptr[5] */
+ lw s7, 28(v0) /* (JLONG)wsptr[7] */
+ or s4, s4, t0
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, s7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 5f
+ addiu v1, v1, -1
+ shra_r.w s5, t2, 5
+ andi s5, s5, 0x3ff
+ lbux s5, s5(a3)
+ lw s1, 0(a2)
+ replv.qb s5, s5
+ usw s5, 0(s1)
+ usw s5, 4(s1)
+ b 6f
+ nop
+5:
+ addu t4, t0, t1 /* z2 + z3 */
+ addiu t8, zero, 4433 /* FIX_0_541196100 */
+ mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t8, zero, 15137 /* FIX_1_847759065 */
+ mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */
+ addiu t8, zero, 6270 /* FIX_0_765366865 */
+ mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */
+ addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+ subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+ sll t4, t4, 13 /* tmp0 =
+ (wsptr[0] + wsptr[4]) << CONST_BITS */
+ sll t2, t2, 13 /* tmp1 =
+ (wsptr[0] - wsptr[4]) << CONST_BITS */
+ subu t1, t5, t1 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */
+ addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */
+ addu t5, t5, t0 /* tmp3 =
+ z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */
+ addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */
+ lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */
+ lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */
+ lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */
+ lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */
+ addu s0, t4, t6 /* z3 = tmp0 + tmp2 */
+ addiu t8, zero, 9633 /* FIX_1_175875602 */
+ addu s1, t5, t7 /* z4 = tmp1 + tmp3 */
+ addu s2, s0, s1 /* z3 + z4 */
+ mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu s3, t4, t7 /* z1 = tmp0 + tmp3 */
+ addu t9, t5, t6 /* z2 = tmp1 + tmp2 */
+ addiu t8, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t8, zero, 3196 /* FIX_0_390180644 */
+ mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t8, zero, 2446 /* FIX_0_298631336 */
+ mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t8, zero, 7373 /* FIX_0_899976223 */
+ mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t8, zero, 16819 /* FIX_2_053119869 */
+ mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t8, zero, 20995 /* FIX_2_562915447 */
+ mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ addiu t8, zero, 25172 /* FIX_3_072711026 */
+ mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t8, zero, 12299 /* FIX_1_501321110 */
+ mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ subu s0, s2, s0 /* z3 += z5 */
+ subu s1, s2, s1 /* z4 += z5 */
+ addu t4, t4, s0
+ subu t4, t4, s3 /* tmp0 */
+ addu t5, t5, s1
+ subu t5, t5, t9 /* tmp1 */
+ addu t6, t6, s0
+ subu t6, t6, t9 /* tmp2 */
+ addu t7, t7, s1
+ subu t7, t7, s3 /* tmp3 */
+ addu s0, t0, t7
+ subu t0, t0, t7
+ addu t7, t2, t6
+ subu t2, t2, t6
+ addu t6, t3, t5
+ subu t3, t3, t5
+ addu t5, t1, t4
+ subu t1, t1, t4
+ shra_r.w s0, s0, 18
+ shra_r.w t7, t7, 18
+ shra_r.w t6, t6, 18
+ shra_r.w t5, t5, 18
+ shra_r.w t1, t1, 18
+ shra_r.w t3, t3, 18
+ shra_r.w t2, t2, 18
+ shra_r.w t0, t0, 18
+ andi s0, s0, 0x3ff
+ andi t7, t7, 0x3ff
+ andi t6, t6, 0x3ff
+ andi t5, t5, 0x3ff
+ andi t1, t1, 0x3ff
+ andi t3, t3, 0x3ff
+ andi t2, t2, 0x3ff
+ andi t0, t0, 0x3ff
+ lw s1, 0(a2)
+ lbux s0, s0(a3)
+ lbux t7, t7(a3)
+ lbux t6, t6(a3)
+ lbux t5, t5(a3)
+ lbux t1, t1(a3)
+ lbux t3, t3(a3)
+ lbux t2, t2(a3)
+ lbux t0, t0(a3)
+ sb s0, 0(s1)
+ sb t7, 1(s1)
+ sb t6, 2(s1)
+ sb t5, 3(s1)
+ sb t1, 4(s1)
+ sb t3, 5(s1)
+ sb t2, 6(s1)
+ sb t0, 7(s1)
+6:
+ addiu v0, v0, 32
+ bgtz v1, 4b
+ addiu a2, a2, 4
+ addiu sp, sp, 256
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
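+/*
+ * Column pass of the fast IDCT, cf. jpeg_idct_ifast (jidctfst.c).  Two
+ * columns are processed at a time as packed halfwords: mulq_s.ph does
+ * the Q15 fractional multiplies against the four constants stored in
+ * mips_idct_ifast_coefs, and all-zero AC columns take the early-store
+ * shortcut that fills the eight workspace rows with the DC value.
+ */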
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu t9, a0, 16 /* end address */
+ or AT, a3, zero
+
+0:
+ lw s0, 0(a1) /* quantptr[DCTSIZE*0] */
+ lw t0, 0(a0) /* inptr[DCTSIZE*0] */
+ lw t1, 16(a0) /* inptr[DCTSIZE*1] */
+ muleq_s.w.phl v0, t0, s0 /* tmp0 ... */
+ lw t2, 32(a0) /* inptr[DCTSIZE*2] */
+ lw t3, 48(a0) /* inptr[DCTSIZE*3] */
+ lw t4, 64(a0) /* inptr[DCTSIZE*4] */
+ lw t5, 80(a0) /* inptr[DCTSIZE*5] */
+ muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */
+ lw t6, 96(a0) /* inptr[DCTSIZE*6] */
+ lw t7, 112(a0) /* inptr[DCTSIZE*7] */
+ or s4, t1, t2
+ or s5, t3, t4
+ bnez s4, 1f
+ ins t0, v0, 16, 16 /* ... tmp0 */
+ bnez s5, 1f
+ or s6, t5, t6
+ or s6, s6, t7
+ bnez s6, 1f
+ sw t0, 0(a2) /* wsptr[DCTSIZE*0] */
+ sw t0, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw t0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw t0, 48(a2) /* wsptr[DCTSIZE*3] */
+ sw t0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw t0, 80(a2) /* wsptr[DCTSIZE*5] */
+ sw t0, 96(a2) /* wsptr[DCTSIZE*6] */
+ sw t0, 112(a2) /* wsptr[DCTSIZE*7] */
+ addiu a0, a0, 4
+ b 2f
+ addiu a1, a1, 4
+
+1:
+ lw s1, 32(a1) /* quantptr[DCTSIZE*2] */
+ lw s2, 64(a1) /* quantptr[DCTSIZE*4] */
+ muleq_s.w.phl v0, t2, s1 /* tmp1 ... */
+ muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */
+ lw s0, 16(a1) /* quantptr[DCTSIZE*1] */
+ lw s1, 48(a1) /* quantptr[DCTSIZE*3] */
+ lw s3, 96(a1) /* quantptr[DCTSIZE*6] */
+ muleq_s.w.phl v1, t4, s2 /* tmp2 ... */
+ muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */
+ lw s2, 80(a1) /* quantptr[DCTSIZE*5] */
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ ins t2, v0, 16, 16 /* ... tmp1 */
+ muleq_s.w.phl v0, t6, s3 /* tmp3 ... */
+ muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */
+ ins t4, v1, 16, 16 /* ... tmp2 */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ ins t6, v0, 16, 16 /* ... tmp3 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ muleq_s.w.phl v0, t1, s0 /* tmp4 ... */
+ muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */
+ shll_s.ph s6, s6, 1 /* x2 */
+ lw s3, 112(a1) /* quantptr[DCTSIZE*7] */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ muleq_s.w.phl v1, t7, s3 /* tmp7 ... */
+ muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */
+ ins t1, v0, 16, 16 /* ... tmp4 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ muleq_s.w.phl v0, t5, s2 /* tmp6 ... */
+ muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */
+ ins t7, v1, 16, 16 /* ... tmp7 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ muleq_s.w.phl v1, t3, s1 /* tmp5 ... */
+ muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */
+ ins t5, v0, 16, 16 /* ... tmp6 */
+ ins t3, v1, 16, 16 /* ... tmp5 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7
+ subq.ph s1, t0, t7
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ shll_s.ph s5, s5, 1 /* x2 */
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ sw s0, 0(a2) /* wsptr[DCTSIZE*0] */
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 4
+ addiu a1, a1, 4
+ sw s1, 112(a2) /* wsptr[DCTSIZE*7] */
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ addq.ph s2, t2, t5
+ addq.ph t1, s4, t3 /* tmp4 */
+ subq.ph s3, t2, t5
+ sw s2, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw s3, 96(a2) /* wsptr[DCTSIZE*6] */
+ addq.ph v0, t4, t3
+ subq.ph v1, t4, t3
+ sw v0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw v1, 80(a2) /* wsptr[DCTSIZE*5] */
+ addq.ph v0, t6, t1
+ subq.ph v1, t6, t1
+ sw v0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw v1, 48(a2) /* wsptr[DCTSIZE*3] */
+
+2:
+ bne a0, t9, 0b
+ addiu a2, a2, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
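+/*
+ * Row pass of the fast IDCT.  Results are narrowed to bytes with
+ * precrq.qb.ph, and the 0x80808080 constant kept in s8 applies the
+ * +128 level shift to four samples at once before each store to
+ * output_buf[ctr] + output_col.
+ */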
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ addiu t9, a0, 128 /* end address */
+ lui s8, 0x8080
+ ori s8, s8, 0x8080
+
+0:
+ lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */
+ lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */
+ lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */
+ lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */
+ lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */
+ lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */
+ lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */
+ lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */
+ lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */
+ precrq.ph.w t1, s0, t0 /* B b */
+ ins t0, s0, 16, 16 /* A a */
+ bnez t1, 1f
+ or s0, t2, s2
+ bnez s0, 1f
+ or s0, t4, s4
+ bnez s0, 1f
+ or s0, t6, s6
+ bnez s0, 1f
+ shll_s.ph s0, t0, 2 /* A a */
+ lw a3, 0(a1)
+ lw AT, 4(a1)
+ precrq.ph.w t0, s0, s0 /* A A */
+ ins s0, s0, 16, 16 /* a a */
+ addu a3, a3, a2
+ addu AT, AT, a2
+ precrq.qb.ph t0, t0, t0 /* A A A A */
+ precrq.qb.ph s0, s0, s0 /* a a a a */
+ addu.qb s0, s0, s8
+ addu.qb t0, t0, s8
+ sw s0, 0(a3)
+ sw s0, 4(a3)
+ sw t0, 0(AT)
+ sw t0, 4(AT)
+ addiu a0, a0, 32
+ bne a0, t9, 0b
+ addiu a1, a1, 8
+ b 2f
+ nop
+
+1:
+ precrq.ph.w t3, s2, t2
+ ins t2, s2, 16, 16
+ precrq.ph.w t5, s4, t4
+ ins t4, s4, 16, 16
+ precrq.ph.w t7, s6, t6
+ ins t6, s6, 16, 16
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ shll_s.ph s6, s6, 1 /* x2 */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7 /* tmp0 + tmp7 */
+ subq.ph s7, t0, t7 /* tmp0 - tmp7 */
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ lw a3, 0(a1)
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ shll_s.ph s5, s5, 1 /* x2 */
+ addu a3, a3, a2
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 32
+ addiu a1, a1, 8
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ shll_s.ph s0, s0, 2
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ shll_s.ph s7, s7, 2
+ addq.ph t1, s4, t3 /* tmp4 */
+ addq.ph s1, t2, t5 /* tmp1 + tmp6 */
+ subq.ph s6, t2, t5 /* tmp1 - tmp6 */
+ addq.ph s2, t4, t3 /* tmp2 + tmp5 */
+ subq.ph s5, t4, t3 /* tmp2 - tmp5 */
+ addq.ph s4, t6, t1 /* tmp3 + tmp4 */
+ subq.ph s3, t6, t1 /* tmp3 - tmp4 */
+ shll_s.ph s1, s1, 2
+ shll_s.ph s2, s2, 2
+ shll_s.ph s3, s3, 2
+ shll_s.ph s4, s4, 2
+ shll_s.ph s5, s5, 2
+ shll_s.ph s6, s6, 2
+ precrq.ph.w t0, s1, s0 /* B A */
+ ins s0, s1, 16, 16 /* b a */
+ precrq.ph.w t2, s3, s2 /* D C */
+ ins s2, s3, 16, 16 /* d c */
+ precrq.ph.w t4, s5, s4 /* F E */
+ ins s4, s5, 16, 16 /* f e */
+ precrq.ph.w t6, s7, s6 /* H G */
+ ins s6, s7, 16, 16 /* h g */
+ precrq.qb.ph t0, t2, t0 /* D C B A */
+ precrq.qb.ph s0, s2, s0 /* d c b a */
+ precrq.qb.ph t4, t6, t4 /* H G F E */
+ precrq.qb.ph s4, s6, s4 /* h g f e */
+ addu.qb s0, s0, s8
+ addu.qb s4, s4, s8
+ sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */
+ sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */
+ lw a3, -4(a1)
+ addu.qb t0, t0, s8
+ addu a3, a3, a2
+ addu.qb t4, t4, s8
+ sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */
+ bne a0, t9, 0b
+ sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */
+
+2:
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ */
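+/*
+ * Accurate forward DCT, cf. jpeg_fdct_islow (jfdctint.c).  Loop 1
+ * transforms the rows, with the FIX() constants packed two per register
+ * (e.g. t0 = 6437 << 16 | 2260) so each dpa.w.ph performs two
+ * multiply-accumulates; loop 2 then transforms the columns using plain
+ * madd/msub chains.
+ */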
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ lui t0, 6437
+ ori t0, 2260
+ lui t1, 9633
+ ori t1, 11363
+ lui t2, 0xd39e
+ ori t2, 0xe6dc
+ lui t3, 0xf72d
+ ori t3, 9633
+ lui t4, 2261
+ ori t4, 9633
+ lui t5, 0xd39e
+ ori t5, 6437
+ lui t6, 9633
+ ori t6, 0xd39d
+ lui t7, 0xe6dc
+ ori t7, 2260
+ lui t8, 4433
+ ori t8, 10703
+ lui t9, 0xd630
+ ori t9, 4433
+ li s8, 8
+ move a1, a0
+1:
+ lw s0, 0(a1) /* tmp0 = 1|0 */
+ lw s1, 4(a1) /* tmp1 = 3|2 */
+ lw s2, 8(a1) /* tmp2 = 5|4 */
+ lw s3, 12(a1) /* tmp3 = 7|6 */
+ packrl.ph s1, s1, s1 /* tmp1 = 2|3 */
+ packrl.ph s3, s3, s3 /* tmp3 = 6|7 */
+ subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */
+ dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */
+ dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */
+ dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */
+ dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */
+ addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */
+ extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */
+ addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sh s0, 2(a1)
+ sh s1, 6(a1)
+ sh s2, 10(a1)
+ sh s3, 14(a1)
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */
+ sra s4, s5, 16 /* tmp4 = t11 */
+ addiu a1, a1, 16
+ addiu s8, s8, -1
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ addu s2, s5, s4 /* tmp2 = t10 + t11 */
+ subu s3, s5, s4 /* tmp3 = t10 - t11 */
+ sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */
+ sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */
+ sh s2, -16(a1)
+ sh s3, -8(a1)
+ sh s0, -12(a1)
+ bgtz s8, 1b
+ sh s1, -4(a1)
+ li t0, 2260
+ li t1, 11363
+ li t2, 9633
+ li t3, 6436
+ li t4, 6437
+ li t5, 2261
+ li t6, 11362
+ li t7, 2259
+ li t8, 4433
+ li t9, 10703
+ li a1, 10704
+ li s8, 8
+
+2:
+ lh a2, 0(a0) /* 0 */
+ lh a3, 16(a0) /* 8 */
+ lh v0, 32(a0) /* 16 */
+ lh v1, 48(a0) /* 24 */
+ lh s4, 64(a0) /* 32 */
+ lh s5, 80(a0) /* 40 */
+ lh s6, 96(a0) /* 48 */
+ lh s7, 112(a0) /* 56 */
+ addu s2, v0, s5 /* tmp2 = 16 + 40 */
+ subu s5, v0, s5 /* tmp5 = 16 - 40 */
+ addu s3, v1, s4 /* tmp3 = 24 + 32 */
+ subu s4, v1, s4 /* tmp4 = 24 - 32 */
+ addu s0, a2, s7 /* tmp0 = 0 + 56 */
+ subu s7, a2, s7 /* tmp7 = 0 - 56 */
+ addu s1, a3, s6 /* tmp1 = 8 + 48 */
+ subu s6, a3, s6 /* tmp6 = 8 - 48 */
+ addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */
+ subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */
+ addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */
+ subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */
+ mult s7, t1 /* ac0 = tmp7 * c1 */
+ madd s4, t0 /* ac0 += tmp4 * c0 */
+ madd s5, t4 /* ac0 += tmp5 * c4 */
+ madd s6, t2 /* ac0 += tmp6 * c2 */
+ mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */
+ msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */
+ msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */
+ msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */
+ mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */
+ madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */
+ madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */
+ msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */
+ mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */
+ msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */
+ madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */
+ msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */
+ extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */
+ extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */
+ extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */
+ extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */
+ addiu s8, s8, -1
+ addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */
+ subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */
+ sh s0, 16(a0)
+ sh s1, 48(a0)
+ sh s2, 80(a0)
+ sh s3, 112(a0)
+ mult v0, t8 /* ac0 = tmp12 * c8 */
+ madd v1, t9 /* ac0 += tmp13 * c9 */
+ mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */
+ msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */
+ addiu a0, a0, 2
+ extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */
+ extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */
+ shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */
+ shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */
+ sh s4, -2(a0)
+ sh s5, 62(a0)
+ sh s6, 30(a0)
+ bgtz s8, 2b
+ sh s7, 94(a0)
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ jr ra
+ nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ */
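+/*
+ * Fast forward DCT, cf. jpeg_fdct_ifast (jfdctfst.c).  The fractional
+ * constants are scaled by 256 (e.g. 181/256 ~ 0.707106781), so the
+ * accumulated products are descaled with extr.w ..., 8.
+ */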
+ .set at
+
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) |
+ (334 & 0xffff) */
+ li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) |
+ (139 & 0xffff) */
+ li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) |
+ (98 & 0xffff) */
+ li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) |
+ (181 & 0xffff) */
+
+ move v0, a0
+ addiu v1, v0, 128 /* end address */
+
+0:
+ lw t0, 0(v0) /* tmp0 = 1|0 */
+ lw t1, 4(v0) /* tmp1 = 3|2 */
+ lw t2, 8(v0) /* tmp2 = 5|4 */
+ lw t3, 12(v0) /* tmp3 = 7|6 */
+ packrl.ph t1, t1, t1 /* tmp1 = 2|3 */
+ packrl.ph t3, t3, t3 /* tmp3 = 6|7 */
+ subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sra t4, t8, 16 /* tmp4 = t11 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t9, s1
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */
+ dpsx.w.ph $ac1, t5, a3 /* ac1 -= t6*98 + t7*98 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */
+ precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */
+ addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */
+ subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */
+ extr.w t4, $ac0, 8 /* t4 = MULTIPLY(t12 + t13, 181) */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */
+ extr.w t0, $ac1, 8 /* t0 = z5 */
+ extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */
+ extr.w t7, $ac3, 8 /* t7 = MULTIPLY(tmp12, 334) */
+ extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */
+ add t6, t1, t0 /* t6 = z2 */
+ add t7, t7, t0 /* t7 = z4 */
+ subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */
+ addq.ph t8, t5, t8 /* t8 = z11 = tmp7 + z3 */
+ addq.ph t1, t0, t6 /* t1 = z13 + z2 */
+ subq.ph t6, t0, t6 /* t6 = z13 - z2 */
+ addq.ph t0, t8, t7 /* t0 = z11 + z4 */
+ subq.ph t7, t8, t7 /* t7 = z11 - z4 */
+ addq.ph t5, t4, t9
+ subq.ph t4, t9, t4
+ sh t2, 0(v0)
+ sh t5, 4(v0)
+ sh t3, 8(v0)
+ sh t4, 12(v0)
+ sh t1, 10(v0)
+ sh t6, 6(v0)
+ sh t0, 2(v0)
+ sh t7, 14(v0)
+ addiu v0, 16
+ bne v1, v0, 0b
+ nop
+ move v0, a0
+ addiu v1, v0, 16
+
+1:
+ lh t0, 0(v0) /* 0 */
+ lh t1, 16(v0) /* 8 */
+ lh t2, 32(v0) /* 16 */
+ lh t3, 48(v0) /* 24 */
+ lh t4, 64(v0) /* 32 */
+ lh t5, 80(v0) /* 40 */
+ lh t6, 96(v0) /* 48 */
+ lh t7, 112(v0) /* 56 */
+ add t8, t0, t7 /* t8 = tmp0 */
+ sub t7, t0, t7 /* t7 = tmp7 */
+ add t0, t1, t6 /* t0 = tmp1 */
+ sub t1, t1, t6 /* t1 = tmp6 */
+ add t6, t2, t5 /* t6 = tmp2 */
+ sub t5, t2, t5 /* t5 = tmp5 */
+ add t2, t3, t4 /* t2 = tmp3 */
+ sub t3, t3, t4 /* t3 = tmp4 */
+ add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */
+ sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */
+ sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */
+ ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */
+ add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */
+ add s0, t4, t2 /* s0 = tmp10 + tmp11 */
+ sub t4, t4, t2 /* t4 = tmp10-tmp11 */
+ sh s0, 0(v0)
+ sh t4, 64(v0)
+ extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13,
+ FIX_0_707106781) */
+ addq.ph t4, t8, t2 /* t4 = tmp13 + z1 */
+ subq.ph t8, t8, t2 /* t8 = tmp13 - z1 */
+ sh t4, 32(v0)
+ sh t8, 96(v0)
+ add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */
+ add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */
+ add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */
+ andi t4, a1, 0xffff
+ mul s0, t1, t4
+ sra s0, s0, 8 /* s0 = z4 =
+ MULTIPLY(tmp12, FIX_1_306562965) */
+ ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */
+ mult $0, $0 /* ac0 = 0 */
+ mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */
+ extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12,
+ FIX_0_382683433) */
+ add t2, t7, t8 /* t2 = tmp7 + z5 */
+ sub t7, t7, t8 /* t7 = tmp7 - z5 */
+ andi t4, a2, 0xffff
+ mul t8, t3, t4
+ sra t8, t8, 8 /* t8 = z2 =
+ MULTIPLY(tmp10, FIX_0_541196100) */
+ andi t4, s1, 0xffff
+ mul t6, t0, t4
+ sra t6, t6, 8 /* t6 = z3 =
+ MULTIPLY(tmp11, FIX_0_707106781) */
+ add t0, t6, t8 /* t0 = z3 + z2 */
+ sub t1, t6, t8 /* t1 = z3 - z2 */
+ add t3, t6, s0 /* t3 = z3 + z4 */
+ sub t4, t6, s0 /* t4 = z3 - z4 */
+ sub t5, t2, t1 /* t5 = dataptr[5] */
+ sub t6, t7, t0 /* t6 = dataptr[3] */
+ add t3, t2, t3 /* t3 = dataptr[1] */
+ add t4, t7, t4 /* t4 = dataptr[7] */
+ sh t5, 80(v0)
+ sh t6, 48(v0)
+ sh t3, 16(v0)
+ sh t4, 112(v0)
+ addiu v0, 2
+ bne v0, v1, 1b
+ nop
+
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
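+/*
+ * Integer quantization using the divisor table layout built by
+ * jcdctmgr.c: reciprocals at +0, corrections at +128, and shift counts
+ * at +384 bytes.  Each coefficient is reduced to its absolute value via
+ * a computed +/-1 factor, biased by the correction, multiplied by the
+ * reciprocal, shifted right by (shift + 16), and the sign re-applied.
+ */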
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+ addiu v0, a2, 124 /* v0 = workspace_end */
+ lh t0, 0(a2)
+ lh t1, 0(a1)
+ lh t2, 128(a1)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ lh t8, 386(a1)
+
+1:
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ add s1, t6, t5
+ andi s1, 0xffff
+ sh v1, 0(a0)
+
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ srav s2, s2, s1
+ mul s2, s2, s0
+ lh t0, 0(a2)
+ lh t1, 0(a1)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ lh t2, 128(a1)
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t8, 386(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ sh s2, 2(a0)
+ lh t0, 0(a2)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ bne a2, v0, 1b
+ addiu a0, a0, 4
+
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ sh v1, 0(a0)
+ add s1, t6, t5
+ andi s1, 0xffff
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ srav s2, s2, s1
+ mul s2, s2, s0
+ sh s2, 2(a0)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+ j ra
+ nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
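+/*
+ * Float quantization, cf. the float path in jcdctmgr.c:
+ *   output[i] = (int)(workspace[i] * divisors[i] + 16384.5) - 16384
+ * The magic constant 0x46800100 is 16384.5f; madd.s forms the product
+ * plus bias, trunc.w.s truncates, and the -16384 restores the rounded
+ * signed result.
+ */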
+ .set at
+
+ li t1, 0x46800100 /* IEEE-754 bit pattern of 16384.5f */
+ mtc1 t1, f0
+ li t0, 63
+0:
+ lwc1 f2, 0(a2)
+ lwc1 f10, 0(a1)
+ lwc1 f4, 4(a2)
+ lwc1 f12, 4(a1)
+ lwc1 f6, 8(a2)
+ lwc1 f14, 8(a1)
+ lwc1 f8, 12(a2)
+ lwc1 f16, 12(a1)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ lwc1 f10, 16(a1)
+ lwc1 f12, 20(a1)
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ lwc1 f14, 24(a1)
+ lwc1 f16, 28(a1)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ lwc1 f2, 16(a2)
+ lwc1 f4, 20(a2)
+ lwc1 f6, 24(a2)
+ lwc1 f8, 28(a2)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ sh t1, 0(a0)
+ sh t2, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ addiu t0, t0, -8
+ addiu a2, a2, 32
+ addiu a1, a1, 32
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ sh t1, 8(a0)
+ sh t2, 10(a0)
+ sh t3, 12(a0)
+ sh t4, 14(a0)
+ bgez t0, 0b
+ addiu a0, a0, 16
+
+ j ra
+ nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
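+/*
+ * Reduced 2x2 IDCT, cf. jpeg_idct_2x2 (jidctred.c).  Only coefficient
+ * rows/columns 0, 1, 3, 5 and 7 contribute.  s0/s1 pack the odd-term
+ * constants for dpa.w.ph: 29692 = FIX(3.624509785), -10426 =
+ * -FIX(1.272758580), 6967 = FIX(0.850430095), -5906 = -FIX(0.720959822).
+ */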
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ addiu sp, sp, -40
+ move v0, sp
+ addiu s2, zero, 29692
+ addiu s3, zero, -10426
+ addiu s4, zero, 6967
+ addiu s5, zero, -5906
+ lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */
+ lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */
+ lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */
+ lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */
+ mul t4, t5, t0
+ lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */
+ lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */
+ mul t6, t6, t1
+ mul t5, t5, t0
+ lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */
+ lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */
+ lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */
+ lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */
+ mul t7, t7, t2
+ mult zero, zero
+ mul t8, t8, t3
+ li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+ li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+ ins t6, t5, 16, 16 /* t6 = t5|t6 */
+ sll t4, t4, 15
+ dpa.w.ph $ac0, t6, s0
+ lh t1, 2(a1)
+ lh t6, 2(a0)
+ ins t8, t7, 16, 16 /* t8 = t7|t8 */
+ dpa.w.ph $ac0, t8, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 18(a1)
+ lh t6, 18(a0)
+ lh t2, 50(a1)
+ lh t7, 50(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 82(a1)
+ lh t2, 82(a0)
+ lh t3, 114(a1)
+ lh t4, 114(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 0(v0)
+ sw t8, 20(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 6(a1)
+ lh t6, 6(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 22(a1)
+ lh t6, 22(a0)
+ lh t2, 54(a1)
+ lh t7, 54(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 86(a1)
+ lh t2, 86(a0)
+ lh t3, 118(a1)
+ lh t4, 118(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 4(v0)
+ sw t8, 24(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 10(a1)
+ lh t6, 10(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 26(a1)
+ lh t6, 26(a0)
+ lh t2, 58(a1)
+ lh t7, 58(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 90(a1)
+ lh t2, 90(a0)
+ lh t3, 122(a1)
+ lh t4, 122(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 8(v0)
+ sw t8, 28(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 14(a1)
+ lh t6, 14(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 30(a1)
+ lh t6, 30(a0)
+ lh t2, 62(a1)
+ lh t7, 62(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 94(a1)
+ lh t2, 94(a0)
+ lh t3, 126(a1)
+ lh t4, 126(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 12(v0)
+ sw t8, 32(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ lw t9, 0(a2)
+ lw t3, 0(v0)
+ lw t7, 4(v0)
+ lw t1, 8(v0)
+ addu t9, t9, a3
+ sll t3, t3, 15
+ subu t8, t4, t0
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ shra_r.w t8, t8, 13
+ sw t0, 16(v0)
+ sw t8, 36(v0)
+ lw t5, 12(v0)
+ lw t6, 16(v0)
+ mult t7, s2
+ madd t1, s3
+ madd t5, s4
+ madd t6, s5
+ lw t5, 24(v0)
+ lw t7, 28(v0)
+ mflo t0, $ac0
+ lw t8, 32(v0)
+ lw t2, 36(v0)
+ mult $ac1, t5, s2
+ madd $ac1, t7, s3
+ madd $ac1, t8, s4
+ madd $ac1, t2, s5
+ addu t1, t3, t0
+ subu t6, t3, t0
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ mflo t4, $ac1
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ lw t0, 20(v0)
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ sll t0, t0, 15
+ lw t9, 4(a2)
+ addu t1, t0, t4
+ subu t6, t0, t4
+ addu t9, t9, a3
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ addiu sp, sp, 40
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
+ */
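+/*
+ * Reduced 4x4 IDCT, cf. jpeg_idct_4x4 (jidctred.c).  The two column
+ * loops transform 4 + 3 input columns (one column is skipped, as in the
+ * C code) into the caller-supplied workspace; the four unrolled blocks
+ * that follow transform the workspace rows, emitting four pixels per
+ * output row.  15137 and 6270 are FIX_1_847759065 and FIX_0_765366865.
+ */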
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw v1, 48(sp)
+ move t0, a1
+ move t1, v1
+ li t9, 4
+ li s0, 0x2e75f93e
+ li s1, 0x21f9ba79
+ li s2, 0xecc2efb0
+ li s3, 0x52031ccd
+
+0:
+ lh s6, 32(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 32(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 96(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 96(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 0(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 0(a0) /* quantptr[0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh t5, 112(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s4, 112(a0) /* quantptr[DCTSIZE*7] */
+ lh v0, 80(t0) /* inptr[DCTSIZE*5] */
+ lh s5, 80(a0) /* quantptr[DCTSIZE*5] */
+ lh s6, 48(a0) /* quantptr[DCTSIZE*3] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s7, 16(a0) /* quantptr[DCTSIZE*1] */
+ lh t8, 16(t0) /* inptr[DCTSIZE*1] */
+ subu t6, t6, t7 /* tmp2 = MULTIPLY(z2, FIX_1_847759065) -
+ MULTIPLY(z3, FIX_0_765366865) */
+ lh t7, 48(t0) /* inptr[DCTSIZE*3] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, t6 /* tmp10 = tmp0 + z2 */
+ subu t4, t2, t6 /* tmp12 = tmp0 - z2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, v0, 16, 16
+ ins t7, t8, 16, 16
+ addiu t9, t9, -1
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ mflo s4, $ac0
+ mflo s5, $ac1
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addiu t0, t0, 2
+ addu t6, t4, s4
+ subu t5, t4, s4
+ addu s6, t3, s5
+ subu s7, t3, s5
+ shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw t6, 28(t1)
+ sw t5, 60(t1)
+ sw s6, -4(t1)
+ bgtz t9, 0b
+ sw s7, 92(t1)
+ /* second loop: three more columns (column 4 is skipped) */
+ li t9, 3
+1:
+ lh s6, 34(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 34(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 98(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 98(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 2(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 2(a0) /* quantptr[DCTSIZE*0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh t5, 114(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s4, 114(a0) /* quantptr[DCTSIZE*7] */
+ lh s5, 82(a0) /* quantptr[DCTSIZE*5] */
+ lh t6, 82(t0) /* inptr[DCTSIZE*5] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s6, 50(a0) /* quantptr[DCTSIZE*3] */
+ lh t8, 18(t0) /* inptr[DCTSIZE*1] */
+ subu v0, v0, t7 /* tmp2 = MULTIPLY(z2, FIX_1_847759065) -
+ MULTIPLY(z3, FIX_0_765366865) */
+ lh t7, 50(t0) /* inptr[DCTSIZE*3] */
+ lh s7, 18(a0) /* quantptr[DCTSIZE*1] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, v0 /* tmp10 = tmp0 + z2 */
+ subu t4, t2, v0 /* tmp12 = tmp0 - z2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ mflo t5, $ac0
+ mflo t6, $ac1
+ addiu t9, t9, -1
+ addiu t0, t0, 2
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addu s5, t4, t5
+ subu s4, t4, t5
+ addu s6, t3, t6
+ subu s7, t3, t6
+ shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw s5, 32(t1)
+ sw s4, 64(t1)
+ sw s6, 0(t1)
+ bgtz t9, 1b
+ sw s7, 96(t1)
+ move t1, v1
+ li s4, 15137
+ lw s6, 8(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 24(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 0(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 28(t1) /* wsptr[7] */
+ lh t6, 20(t1) /* wsptr[5] */
+ lh t7, 12(t1) /* wsptr[3] */
+ lh t8, 4(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 0(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* 2 */
+ li s4, 15137
+ lw s6, 40(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 56(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 32(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 60(t1) /* wsptr[7] */
+ lh t6, 52(t1) /* wsptr[5] */
+ lh t7, 44(t1) /* wsptr[3] */
+ lh t8, 36(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ sll s4, t9, 2
+ lw v0, 4(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* 3 */
+ li s4, 15137
+ lw s6, 72(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 88(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 64(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 92(t1) /* wsptr[7] */
+ lh t6, 84(t1) /* wsptr[5] */
+ lh t7, 76(t1) /* wsptr[3] */
+ lh t8, 68(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 8(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ li s4, 15137
+ lw s6, 104(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 120(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 96(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 124(t1) /* wsptr[7] */
+ lh t6, 116(t1) /* wsptr[5] */
+ lh t7, 108(t1) /* wsptr[3] */
+ lh t8, 100(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2; */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 12(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
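+/*
+ * Reduced 6x6 IDCT, cf. jpeg_idct_6x6 (jidctint.c).  The 13-bit
+ * constants are 5793 = FIX(0.707106781), 10033 = FIX(1.224744871) and
+ * 2998 = FIX(0.366025404).  Pass 1 builds a 6x6 word workspace on the
+ * stack; pass 2 (label 2) transforms its rows and stores clamped,
+ * level-shifted bytes.
+ */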
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -144
+ move v0, sp
+ addiu v1, v0, 24
+ addiu t9, zero, 5793
+ addiu s0, zero, 10033
+ addiu s1, zero, 2998
+
+1:
+ lh s2, 0(a0) /* q0 = quantptr[ 0] */
+ lh s3, 32(a0) /* q1 = quantptr[16] */
+ lh s4, 64(a0) /* q2 = quantptr[32] */
+ lh t2, 64(a1) /* tmp2 = inptr[32] */
+ lh t1, 32(a1) /* tmp1 = inptr[16] */
+ lh t0, 0(a1) /* tmp0 = inptr[ 0] */
+ mul t2, t2, s4 /* tmp2 = tmp2 * q2 */
+ mul t1, t1, s3 /* tmp1 = tmp1 * q1 */
+ mul t0, t0, s2 /* tmp0 = tmp0 * q0 */
+ lh t6, 16(a1) /* z1 = inptr[ 8] */
+ lh t8, 80(a1) /* z3 = inptr[40] */
+ lh t7, 48(a1) /* z2 = inptr[24] */
+ lh s2, 16(a0) /* q0 = quantptr[ 8] */
+ lh s4, 80(a0) /* q2 = quantptr[40] */
+ lh s3, 48(a0) /* q1 = quantptr[24] */
+ mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */
+ mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ mul t6, t6, s2 /* z1 = z1 * q0 */
+ mul t8, t8, s4 /* z3 = z3 * q2 */
+ mul t7, t7, s3 /* z2 = z2 * q1 */
+ addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */
+ sll t2, t2, 1 /* tmp2 = tmp2 << 1 */
+ subu t4, t0, t2 /* tmp11 = tmp0 - tmp2*2 */
+ subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */
+ addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */
+ addu t1, t6, t8 /* tmp1 = z1 + z3 */
+ mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ subu t2, t6, t8 /* tmp2 = z1 - z3 */
+ subu t2, t2, t7 /* tmp2 = tmp2 - z2 */
+ sll t2, t2, 2 /* tmp2 = tmp2 << 2 */
+ addu t0, t6, t7 /* tmp0 = z1 + z2 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ subu s2, t8, t7 /* q0 = z3 - z2 */
+ sll s2, s2, 13 /* q0 = q0 << 13 */
+ addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */
+ addu t1, s2, t1 /* tmp1 = q0 + tmp1 */
+ addu s2, t4, t2 /* q0 = tmp11 + tmp2 */
+ subu s3, t4, t2 /* q1 = tmp11 - tmp2 */
+ addu t6, t3, t0 /* z1 = tmp10 + tmp0 */
+ subu t7, t3, t0 /* z2 = tmp10 - tmp0 */
+ addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */
+ subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */
+ shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */
+ shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */
+ sw s2, 24(v0)
+ sw s3, 96(v0)
+ sw t6, 0(v0)
+ sw t7, 120(v0)
+ sw t4, 48(v0)
+ sw t5, 72(v0)
+ addiu v0, v0, 4
+ addiu a1, a1, 2
+ bne v0, v1, 1b
+ addiu a0, a0, 2
+
+ /* Pass 2: process 6 rows from work array, store into output array. */
+ move v0, sp
+ addiu v1, v0, 144
+
+2:
+ lw t0, 0(v0)
+ lw t2, 16(v0)
+ lw s5, 0(a2)
+ addiu t0, t0, 16
+ sll t0, t0, 13
+ mul t3, t2, t9
+ lw t6, 4(v0)
+ lw t8, 20(v0)
+ lw t7, 12(v0)
+ addu s5, s5, a3
+ addu s6, t6, t8
+ mul s6, s6, s1
+ addu t1, t0, t3
+ subu t4, t0, t3
+ subu t4, t4, t3
+ lw t3, 8(v0)
+ mul t0, t3, s0
+ addu s7, t6, t7
+ sll s7, s7, 13
+ addu s7, s6, s7
+ subu t2, t8, t7
+ sll t2, t2, 13
+ addu t2, s6, t2
+ subu s6, t6, t7
+ subu s6, s6, t8
+ sll s6, s6, 13
+ addu t3, t1, t0
+ subu t5, t1, t0
+ addu t6, t3, s7
+ subu t3, t3, s7
+ addu t7, t4, s6
+ subu t4, t4, s6
+ addu t8, t5, t2
+ subu t5, t5, t2
+ shll_s.w t6, t6, 6
+ shll_s.w t3, t3, 6
+ shll_s.w t7, t7, 6
+ shll_s.w t4, t4, 6
+ shll_s.w t8, t8, 6
+ shll_s.w t5, t5, 6
+ sra t6, t6, 24
+ addiu t6, t6, 128
+ sra t3, t3, 24
+ addiu t3, t3, 128
+ sb t6, 0(s5)
+ sra t7, t7, 24
+ addiu t7, t7, 128
+ sb t3, 5(s5)
+ sra t4, t4, 24
+ addiu t4, t4, 128
+ sb t7, 1(s5)
+ sra t8, t8, 24
+ addiu t8, t8, 128
+ sb t4, 4(s5)
+ addiu v0, v0, 24
+ sra t5, t5, 24
+ addiu t5, t5, 128
+ sb t8, 2(s5)
+ addiu a2, a2, 4
+ bne v0, v1, 2b
+ sb t5, 3(s5)
+
+ addiu sp, sp, 144
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ */
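+ /* Note: this pass transforms 8 columns per call (a3 counts them down)
+    and writes 12 rows of 32-bit intermediate values, spaced 32 bytes
+    apart, into the workspace. */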
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 8
+
+1:
+ /* odd part */
+ lh t0, 48(a1)
+ lh t1, 48(a0)
+ lh t2, 16(a1)
+ lh t3, 16(a0)
+ lh t4, 80(a1)
+ lh t5, 80(a0)
+ lh t6, 112(a1)
+ lh t7, 112(a0)
+ mul t0, t0, t1 /* z2 */
+ mul t1, t2, t3 /* z1 */
+ mul t2, t4, t5 /* z3 */
+ mul t3, t6, t7 /* z4 */
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ li t6, 7053 /* FIX(0.860918669) */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t7, t1, t2 /* tmp10 */
+ addu t8, t7, t3 /* tmp10 + z4 */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2139 /* FIX(0.261052384) */
+ mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ li t7, 2295 /* FIX(0.280143716) */
+ mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1-=z4 */
+ subu t0, t0, t2 /* z2-=z3 */
+ addu t2, t0, t1 /* z1+z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t8, t6, t8 /* tmp12 */
+ addu t3, t8, t4 /* tmp12 + tmp11 */
+ addu t3, t3, t7 /* tmp10 */
+ subu t8, t8, t9 /* tmp12 + tmp13 */
+ addu s0, t5, s0
+ subu t8, t8, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu s1, s1, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ /* even part start */
+ lh t4, 64(a1)
+ lh t5, 64(a0)
+ lh t7, 32(a1)
+ lh s0, 32(a0)
+ lh s1, 0(a1)
+ lh s2, 0(a0)
+ lh s3, 96(a1)
+ lh v0, 96(a0)
+ mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ /* odd part end */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* update counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 2
+ addiu a1, a1, 2
+ /* even part rest */
+ li s1, 10033
+ li s2, 11190
+ mul t4, t4, s1 /* z4 */
+ mul s1, t5, s2 /* z4 */
+ sll t5, t5, 13 /* z1 */
+ sll t7, t7, 13
+ addiu t7, t7, 1024 /* z3 */
+ sll s0, s0, 13 /* z2 */
+ addu s2, t7, t4 /* tmp10 */
+ subu t4, t7, t4 /* tmp11 */
+ subu s3, t5, s0 /* tmp12 */
+ addu t2, t7, s3 /* tmp21 */
+ subu s3, t7, s3 /* tmp24 */
+ addu t7, s1, s0 /* tmp12 */
+ addu v0, s2, t7 /* tmp20 */
+ subu s2, s2, t7 /* tmp25 */
+ subu s1, s1, t5 /* z4 - z1 */
+ subu s1, s1, s0 /* tmp12 */
+ addu s0, t4, s1 /* tmp22 */
+ subu t4, t4, s1 /* tmp23 */
+ /* final output stage */
+ addu t5, v0, t3
+ subu v0, v0, t3
+ addu t3, t2, t1
+ subu t2, t2, t1
+ addu t1, s0, t8
+ subu s0, s0, t8
+ addu t8, t4, t9
+ subu t4, t4, t9
+ addu t9, s3, t0
+ subu s3, s3, t0
+ addu t0, s2, t6
+ subu s2, s2, t6
+ sra t5, t5, 11
+ sra t3, t3, 11
+ sra t1, t1, 11
+ sra t8, t8, 11
+ sra t9, t9, 11
+ sra t0, t0, 11
+ sra s2, s2, 11
+ sra s3, s3, 11
+ sra t4, t4, 11
+ sra s0, s0, 11
+ sra t2, t2, 11
+ sra v0, v0, 11
+ sw t5, 0(a2)
+ sw t3, 32(a2)
+ sw t1, 64(a2)
+ sw t8, 96(a2)
+ sw t9, 128(a2)
+ sw t0, 160(a2)
+ sw s2, 192(a2)
+ sw s3, 224(a2)
+ sw t4, 256(a2)
+ sw s0, 288(a2)
+ sw t2, 320(a2)
+ sw v0, 352(a2)
+ bgtz a3, 1b
+ addiu a2, a2, 4
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ */
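+ /* Note: this pass transforms the 12 workspace rows (a3 counts them
+    down) and stores 12 clamped, level-shifted samples per row through
+    the output[] row pointers. */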
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 12
+
+1:
+ /* Odd part */
+ lw t0, 12(a0)
+ lw t1, 4(a0)
+ lw t2, 20(a0)
+ lw t3, 28(a0)
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t6, t1, t2 /* tmp10 */
+ li t7, 2139 /* FIX(0.261052384) */
+ mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ addu t6, t6, t3 /* tmp10 + z4 */
+ li t8, 7053 /* FIX(0.860918669) */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2295 /* FIX(0.280143716) */
+ mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1 -= z4 */
+ subu t0, t0, t2 /* z2 -= z3 */
+ addu t2, t1, t0 /* z1 + z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t3, t6, t7 /* tmp12 */
+ addu t7, t3, t4
+ addu t7, t7, t8 /* tmp10 */
+ subu t3, t3, t9
+ subu t3, t3, t5
+ subu t3, t3, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu t9, t9, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* even part */
+ lw t2, 16(a0) /* z4 */
+ lw t4, 8(a0) /* z1 */
+ lw t5, 0(a0) /* z3 */
+ lw t8, 24(a0) /* z2 */
+ li s0, 10033 /* FIX(1.224744871) */
+ li s1, 11190 /* FIX(1.366025404) */
+ mul t2, t2, s0 /* z4 */
+ mul s0, t4, s1 /* z4 */
+ addiu t5, t5, 0x10
+ sll t5, t5, 13 /* z3 */
+ sll t4, t4, 13 /* z1 */
+ sll t8, t8, 13 /* z2 */
+ subu s1, t4, t8 /* tmp12 */
+ addu s2, t5, t2 /* tmp10 */
+ subu t2, t5, t2 /* tmp11 */
+ addu s3, t5, s1 /* tmp21 */
+ subu s1, t5, s1 /* tmp24 */
+ addu t5, s0, t8 /* tmp12 */
+ addu v0, s2, t5 /* tmp20 */
+ subu t5, s2, t5 /* tmp25 */
+ subu t4, s0, t4
+ subu t4, t4, t8 /* tmp12 */
+ addu t8, t2, t4 /* tmp22 */
+ subu t2, t2, t4 /* tmp23 */
+ /* increment counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 32
+ /* Final stage */
+ addu t4, v0, t7
+ subu v0, v0, t7
+ addu t7, s3, t1
+ subu s3, s3, t1
+ addu t1, t8, t3
+ subu t8, t8, t3
+ addu t3, t2, t9
+ subu t2, t2, t9
+ addu t9, s1, t0
+ subu s1, s1, t0
+ addu t0, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 4
+ sll t7, t7, 4
+ sll t1, t1, 4
+ sll t3, t3, 4
+ sll t9, t9, 4
+ sll t0, t0, 4
+ sll t5, t5, 4
+ sll s1, s1, 4
+ sll t2, t2, 4
+ sll t8, t8, 4
+ sll s3, s3, 4
+ sll v0, v0, 4
+ shll_s.w t4, t4, 2
+ shll_s.w t7, t7, 2
+ shll_s.w t1, t1, 2
+ shll_s.w t3, t3, 2
+ shll_s.w t9, t9, 2
+ shll_s.w t0, t0, 2
+ shll_s.w t5, t5, 2
+ shll_s.w s1, s1, 2
+ shll_s.w t2, t2, 2
+ shll_s.w t8, t8, 2
+ shll_s.w s3, s3, 2
+ shll_s.w v0, v0, 2
+ srl t4, t4, 24
+ srl t7, t7, 24
+ srl t1, t1, 24
+ srl t3, t3, 24
+ srl t9, t9, 24
+ srl t0, t0, 24
+ srl t5, t5, 24
+ srl s1, s1, 24
+ srl t2, t2, 24
+ srl t8, t8, 24
+ srl s3, s3, 24
+ srl v0, v0, 24
+ lw t6, 0(a1)
+ addiu t4, t4, 0x80
+ addiu t7, t7, 0x80
+ addiu t1, t1, 0x80
+ addiu t3, t3, 0x80
+ addiu t9, t9, 0x80
+ addiu t0, t0, 0x80
+ addiu t5, t5, 0x80
+ addiu s1, s1, 0x80
+ addiu t2, t2, 0x80
+ addiu t8, t8, 0x80
+ addiu s3, s3, 0x80
+ addiu v0, v0, 0x80
+ sb t4, 0(t6)
+ sb t7, 1(t6)
+ sb t1, 2(t6)
+ sb t3, 3(t6)
+ sb t9, 4(t6)
+ sb t0, 5(t6)
+ sb t5, 6(t6)
+ sb s1, 7(t6)
+ sb t2, 8(t6)
+ sb t8, 9(t6)
+ sb s3, 10(t6)
+ sb v0, 11(t6)
+ bgtz a3, 1b
+ addiu a1, a1, 4
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ jr ra
+ nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
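+ /* Note: each block below loads one 8-sample row (ulw tolerates
+    unaligned row pointers), widens the bytes to halfwords with
+    preceu.ph.qbr/qbl, and adds t7 = 0xff80ff80 (two packed -128
+    halfwords) to subtract CENTERJSAMPLE before storing the results to
+    the workspace. */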
+ lw t0, 0(a0)
+ li t7, 0xff80ff80
+ addu t0, t0, a1
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ lw t0, 4(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 0(a2)
+ usw t4, 4(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 8(a2)
+ usw t6, 12(a2)
+
+ lw t0, 8(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 16(a2)
+ usw t4, 20(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 24(a2)
+ usw t6, 28(a2)
+
+ lw t0, 12(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 32(a2)
+ usw t4, 36(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 40(a2)
+ usw t6, 44(a2)
+
+ lw t0, 16(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 48(a2)
+ usw t4, 52(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 56(a2)
+ usw t6, 60(a2)
+
+ lw t0, 20(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 64(a2)
+ usw t4, 68(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 72(a2)
+ usw t6, 76(a2)
+
+ lw t0, 24(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 80(a2)
+ usw t4, 84(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 88(a2)
+ usw t6, 92(a2)
+
+ lw t0, 28(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 96(a2)
+ usw t4, 100(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 104(a2)
+ usw t6, 108(a2)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 112(a2)
+ usw t4, 116(a2)
+ usw t5, 120(a2)
+ usw t6, 124(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
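+ /* Note: for each of the 8 rows, the samples are loaded as bytes,
+    level-shifted by -128 in integer registers, moved to the FPU with
+    mtc1, converted to single precision with cvt.s.w, and stored to the
+    float workspace. */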
+ .set at
+
+ lw t0, 0(a0)
+ addu t0, t0, a1
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 4(a0)
+ swc1 f2, 0(a2)
+ swc1 f4, 4(a2)
+ swc1 f6, 8(a2)
+ addu t0, t0, a1
+ swc1 f8, 12(a2)
+ swc1 f10, 16(a2)
+ swc1 f12, 20(a2)
+ swc1 f14, 24(a2)
+ swc1 f16, 28(a2)
+ /* element row 1 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 8(a0)
+ swc1 f2, 32(a2)
+ swc1 f4, 36(a2)
+ swc1 f6, 40(a2)
+ addu t0, t0, a1
+ swc1 f8, 44(a2)
+ swc1 f10, 48(a2)
+ swc1 f12, 52(a2)
+ swc1 f14, 56(a2)
+ swc1 f16, 60(a2)
+ /* element row 2 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 12(a0)
+ swc1 f2, 64(a2)
+ swc1 f4, 68(a2)
+ swc1 f6, 72(a2)
+ addu t0, t0, a1
+ swc1 f8, 76(a2)
+ swc1 f10, 80(a2)
+ swc1 f12, 84(a2)
+ swc1 f14, 88(a2)
+ swc1 f16, 92(a2)
+ /* element row 3 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 16(a0)
+ swc1 f2, 96(a2)
+ swc1 f4, 100(a2)
+ swc1 f6, 104(a2)
+ addu t0, t0, a1
+ swc1 f8, 108(a2)
+ swc1 f10, 112(a2)
+ swc1 f12, 116(a2)
+ swc1 f14, 120(a2)
+ swc1 f16, 124(a2)
+ /* element row 4 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 20(a0)
+ swc1 f2, 128(a2)
+ swc1 f4, 132(a2)
+ swc1 f6, 136(a2)
+ addu t0, t0, a1
+ swc1 f8, 140(a2)
+ swc1 f10, 144(a2)
+ swc1 f12, 148(a2)
+ swc1 f14, 152(a2)
+ swc1 f16, 156(a2)
+ /* element row 5 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 24(a0)
+ swc1 f2, 160(a2)
+ swc1 f4, 164(a2)
+ swc1 f6, 168(a2)
+ addu t0, t0, a1
+ swc1 f8, 172(a2)
+ swc1 f10, 176(a2)
+ swc1 f12, 180(a2)
+ swc1 f14, 184(a2)
+ swc1 f16, 188(a2)
+ /* element row 6 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 28(a0)
+ swc1 f2, 192(a2)
+ swc1 f4, 196(a2)
+ swc1 f6, 200(a2)
+ addu t0, t0, a1
+ swc1 f8, 204(a2)
+ swc1 f10, 208(a2)
+ swc1 f12, 212(a2)
+ swc1 f14, 216(a2)
+ swc1 f16, 220(a2)
+ /* element row 7 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ swc1 f2, 224(a2)
+ swc1 f4, 228(a2)
+ swc1 f6, 232(a2)
+ swc1 f8, 236(a2)
+ swc1 f10, 240(a2)
+ swc1 f12, 244(a2)
+ swc1 f14, 248(a2)
+ swc1 f16, 252(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT $1
+#define v0 $2
+#define v1 $3
+#define a0 $4
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29
+#define fp $30
+#define s8 $30
+#define ra $31
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#ifdef __ELF__
+#define HIDDEN_SYMBOL(symbol) .hidden symbol;
+#else
+#define HIDDEN_SYMBOL(symbol)
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+ .globl symbol; \
+ HIDDEN_SYMBOL(symbol) \
+ .align 2; \
+ .type symbol, @function; \
+ .ent symbol, 0; \
+symbol: \
+ .frame sp, 0, ra; \
+ .set push; \
+ .set arch=mips32r2; \
+ .set noreorder; \
+ .set noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+ .set dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+ .set pop; \
+ .end function; \
+ .size function, .-function
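+
+/*
+ * Usage sketch (illustrative only, not part of the original header):
+ * LEAF_MIPS32R2/LEAF_DSPR2 set noreorder, so a routine declared with
+ * them must fill its own branch/jump delay slots, e.g.:
+ *
+ * LEAF_DSPR2(my_routine)
+ *         ...
+ *         j     ra
+ *          nop                 (delay slot)
+ * END(my_routine)
+ */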
+
+/*
+ * Checks whether the stack offset is big enough for storing/restoring
+ * regs_num registers to/from the stack. The stack offset must be greater
+ * than or equal to the number of bytes needed to store the registers
+ * (regs_num * 4). Since the MIPS ABI lets a function use the first 16
+ * bytes of the caller's stack frame (the area reserved for the input
+ * arguments, which are already in a0-a3), the stack size can be reduced
+ * by reusing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
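+
+/*
+ * Example (added for clarity): pushing 8 registers requires
+ * 8 * 4 - 16 = 16 bytes of new stack space, so a stack_offset of 16
+ * passes the check while 12 would trigger the .error above; the other
+ * 16 bytes come from the caller's argument-save area.
+ */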
+
+/*
+ * Saves a set of registers on the stack. The maximum number of registers
+ * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7). The stack
+ * offset is the number of bytes subtracted from the stack pointer (sp)
+ * before the registers are pushed, to provide enough space on the stack
+ * (the offset must be a multiple of 4 and big enough, as checked by the
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with the RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, -\stack_offset
+.endif
+ sw \r1, 0(sp)
+.if \r2 != 0
+ sw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ sw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ sw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ sw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores a set of registers from the stack. The maximum number of
+ * registers that can be restored is limited to 14 (a0-a3, v0-v1 and
+ * s0-s7). The stack offset is the number of bytes added to the stack
+ * pointer (sp) after the registers are restored (the offset must be a
+ * multiple of 4 and big enough, as checked by the CHECK_STACK_OFFSET
+ * macro). This macro is intended to be used in combination with the
+ * SAVE_REGS_ON_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+ lw \r1, 0(sp)
+.if \r2 != 0
+ lw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ lw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ lw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ lw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, \stack_offset
+.endif
+.endm
diff --git a/media/libjpeg/simd/mips64/jccolext-mmi.c b/media/libjpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+ __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+ __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+ __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ col = num_cols * 3;
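+ /* Note: the inline asm below tests the bits of the remaining byte
+    count (1, 2, 4, 8, 16) and assembles the partial pixel group into
+    the mm registers piecewise, so it never reads past the end of the
+    input row. */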
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
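+ /* Note: the 0.58700 * G term is split because FIX(0.58700) = 38470
+  * does not fit in the signed 16-bit words consumed by _mm_madd_pi16();
+  * it is computed as 0.33700 * G (paired with R) plus 0.25000 * G
+  * (paired with B). Spot check: R = G = B = 255 gives
+  * Y = (0.299 + 0.337 + 0.114 + 0.250) * 255 = 255.
+  */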
+
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+ cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+ cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+ blo = _mm_loadlo_pi16_f(bo);
+ bho = _mm_loadhi_pi16_f(bo);
+ halfblo = _mm_srli_pi32(blo, 1);
+ halfbho = _mm_srli_pi32(bho, 1);
+
+ cblo = _mm_add_pi32(cblo, halfblo);
+ cbho = _mm_add_pi32(cbho, halfbho);
+ cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+ cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+ cblo = _mm_srli_pi32(cblo, SCALEBITS);
+ cbho = _mm_srli_pi32(cbho, SCALEBITS);
+ cbo = _mm_packs_pi32(cblo, cbho);
+
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+ cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+ cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+ ble = _mm_loadlo_pi16_f(be);
+ bhe = _mm_loadhi_pi16_f(be);
+ halfble = _mm_srli_pi32(ble, 1);
+ halfbhe = _mm_srli_pi32(bhe, 1);
+
+ cble = _mm_add_pi32(cble, halfble);
+ cbhe = _mm_add_pi32(cbhe, halfbhe);
+ cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+ cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+ cble = _mm_srli_pi32(cble, SCALEBITS);
+ cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+ cbe = _mm_packs_pi32(cble, cbhe);
+
+ cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+ cb = _mm_or_si64(cbe, cbo);
+
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+ crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+ crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ rlo = _mm_loadlo_pi16_f(ro);
+ rho = _mm_loadhi_pi16_f(ro);
+ halfrlo = _mm_srli_pi32(rlo, 1);
+ halfrho = _mm_srli_pi32(rho, 1);
+
+ crlo = _mm_add_pi32(crlo, halfrlo);
+ crho = _mm_add_pi32(crho, halfrho);
+ crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+ crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+ crlo = _mm_srli_pi32(crlo, SCALEBITS);
+ crho = _mm_srli_pi32(crho, SCALEBITS);
+ cro = _mm_packs_pi32(crlo, crho);
+
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+ crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+ crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ rle = _mm_loadlo_pi16_f(re);
+ rhe = _mm_loadhi_pi16_f(re);
+ halfrle = _mm_srli_pi32(rle, 1);
+ halfrhe = _mm_srli_pi32(rhe, 1);
+
+ crle = _mm_add_pi32(crle, halfrle);
+ crhe = _mm_add_pi32(crhe, halfrhe);
+ crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+ crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+ crle = _mm_srli_pi32(crle, SCALEBITS);
+ crhe = _mm_srli_pi32(crhe, SCALEBITS);
+ cre = _mm_packs_pi32(crle, crhe);
+
+ cro = _mm_slli_pi16(cro, BYTE_BIT);
+ cr = _mm_or_si64(cre, cro);
+
+ _mm_store_si64((__m64 *)&outptr0[0], y);
+ _mm_store_si64((__m64 *)&outptr1[0], cb);
+ _mm_store_si64((__m64 *)&outptr2[0], cr);
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jccolor-mmi.c b/media/libjpeg/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081 ((short)5329) /* FIX(0.08131) */
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_168 ((short)11059) /* FIX(0.16874) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+#define F_0_331 ((short)21709) /* FIX(0.33126) */
+#define F_0_418 ((short)27439) /* FIX(0.41869) */
+#define F_0_587 ((short)38470) /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
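+
+/*
+ * Note: FIX(x) denotes x scaled by 2^16 (e.g. 19595 / 65536 ~= 0.29900);
+ * products are descaled by SCALEBITS after adding the one-half rounding
+ * constant set up in const_value[] below.
+ */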
+
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250,
+ index_PW_MF016_MF033,
+ index_PW_MF008_MF041,
+ index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+ _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+ _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+ _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+ ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgray-mmi.c b/media/libjpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+#define F_0_587 ((short)38470) /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgryext-mmi.c b/media/libjpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ */
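+ /* Note: as in the YCC path, 0.58700 * G is split into
+  * 0.33700 * G + 0.25000 * G so that every coefficient fits in the
+  * signed 16-bit words consumed by _mm_madd_pi16(). */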
+
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
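+ /* ye holds the even-pixel Y values and yo the odd-pixel Y values, each
+  * in the low byte of a 16-bit lane.  Shifting yo left by one byte and
+  * OR-ing re-interleaves them into packed bytes (Y0 Y1 ... Y7).
+  */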
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ _mm_store_si64((__m64 *)&outptr[0], y);
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jcsample-mmi.c b/media/libjpeg/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ int inrow, outrow, outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+ JSAMPROW inptr0, inptr1, outptr;
+ __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+ __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+ __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2);
+
+ bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */
+ /* bias={1, 2, 1, 2} (16-bit) */
+ mask = _mm_cmpeq_pi16(mask, mask);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
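+ /* Each output byte is the rounded average of a 2x2 input block:
+  * (s(2x,2y) + s(2x+1,2y) + s(2x,2y+1) + s(2x+1,2y+1) + bias) >> 2,
+  * with the bias alternating 1, 2, 1, 2 across output columns so that the
+  * truncation error does not drift in one direction (the same trick the
+  * C code in jcsample.c uses).
+  */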
+
+ for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+ inrow += 2, outrow++) {
+
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr = output_data[outrow];
+
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+ this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+ this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+ next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+ next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+ this0o = _mm_and_si64(this0, mask);
+ this0e = _mm_srli_pi16(this0, BYTE_BIT);
+ this1o = _mm_and_si64(this1, mask);
+ this1e = _mm_srli_pi16(this1, BYTE_BIT);
+ this0sum = _mm_add_pi16(this0o, this0e);
+ this1sum = _mm_add_pi16(this1o, this1e);
+
+ next0o = _mm_and_si64(next0, mask);
+ next0e = _mm_srli_pi16(next0, BYTE_BIT);
+ next1o = _mm_and_si64(next1, mask);
+ next1e = _mm_srli_pi16(next1, BYTE_BIT);
+ next0sum = _mm_add_pi16(next0o, next0e);
+ next1sum = _mm_add_pi16(next1o, next1e);
+
+ thisavg = _mm_add_pi16(this0sum, this1sum);
+ nextavg = _mm_add_pi16(next0sum, next1sum);
+ thisavg = _mm_add_pi16(thisavg, bias);
+ nextavg = _mm_add_pi16(nextavg, bias);
+ thisavg = _mm_srli_pi16(thisavg, 2);
+ nextavg = _mm_srli_pi16(nextavg, 2);
+
+ avg = _mm_packs_pu16(thisavg, nextavg);
+
+ _mm_store_si64((__m64 *)&outptr[0], avg);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/jcsample.h b/media/libjpeg/simd/mips64/jcsample.h
index 2a50544e97..bd07fcc4ed 100644
--- a/media/libjpeg/simd/jcsample.h
+++ b/media/libjpeg/simd/mips64/jcsample.h
@@ -8,19 +8,19 @@
*/
LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
- JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
{
register JSAMPROW ptr;
register JSAMPLE pixval;
register int count;
int row;
- int numcols = (int) (output_cols - input_cols);
+ int numcols = (int)(output_cols - input_cols);
if (numcols > 0) {
for (row = 0; row < num_rows; row++) {
ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ pixval = ptr[-1];
for (count = numcols; count > 0; count--)
*ptr++ = pixval;
}
diff --git a/media/libjpeg/simd/mips64/jdcolext-mmi.c b/media/libjpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+ __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask;
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+
+ for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+ inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
+ cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
+ cre = _mm_and_si64(mask, cr); /* Cr(0246) */
+ cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */
+ cbe = _mm_add_pi16(cbe, decenter);
+ cbo = _mm_add_pi16(cbo, decenter);
+ cre = _mm_add_pi16(cre, decenter);
+ cro = _mm_add_pi16(cro, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
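+ /* A sketch of the rationale for the folding above: FIX(1.40200) = 91881
+  * does not fit in a signed 16-bit word, so whole multiples of Cb/Cr are
+  * peeled off and applied as plain additions/subtractions:
+  *   1.40200 =  0.40200 + 1   (PW_F0402, then add Cr)
+  *  -0.71414 =  0.28586 - 1   (folded into PW_MF0344_F0285, then subtract Cr)
+  *   1.77200 = -0.22800 + 2   (PW_MF0228, then add Cb twice)
+  */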
+
+ cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
+ cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
+ cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
+ cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
+
+ be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800)) */
+ bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800)) */
+ re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
+ ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, PW_ONE);
+ bo = _mm_add_pi16(bo, PW_ONE);
+ be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */
+ bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */
+ re = _mm_add_pi16(re, PW_ONE);
+ ro = _mm_add_pi16(ro, PW_ONE);
+ re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */
+ ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, cbe);
+ bo = _mm_add_pi16(bo, cbo);
+ be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
+ bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
+ re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
+ ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
+
+ gle = _mm_unpacklo_pi16(cbe, cre);
+ ghe = _mm_unpackhi_pi16(cbe, cre);
+ gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+ ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+ glo = _mm_unpacklo_pi16(cbo, cro);
+ gho = _mm_unpackhi_pi16(cbo, cro);
+ glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+ gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+ gle = _mm_add_pi32(gle, PD_ONEHALF);
+ ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+ gle = _mm_srai_pi32(gle, SCALEBITS);
+ ghe = _mm_srai_pi32(ghe, SCALEBITS);
+ glo = _mm_add_pi32(glo, PD_ONEHALF);
+ gho = _mm_add_pi32(gho, PD_ONEHALF);
+ glo = _mm_srai_pi32(glo, SCALEBITS);
+ gho = _mm_srai_pi32(gho, SCALEBITS);
+
+ ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+ go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+ ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+ go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+ ye = _mm_and_si64(mask, y); /* Y(0246) */
+ yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
+
+ re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+ ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+ re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
+ ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
+
+ ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+ go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+ ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
+ go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
+
+ be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+ bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+ be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
+ bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+
+ mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+ mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
+
+ mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+ mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
+
+ mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
+ mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
+
+ mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
+ mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
+
+ mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
+ mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmE);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 16\r\n"
+ "move $9, %4\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %3\r\n"
+ "move $10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %2\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+ "b 2f\r\n"
+ "nop \r\n"
+
+ "1: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "3: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "4: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "5: \r\n"
+ "nop \r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+ /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+
+ mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
+ mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
+ mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
+
+ mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
+ mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
+ mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmD);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ col = num_cols;
+ asm(".set noreorder\r\n" /* st16 */
+
+ "li $8, 4\r\n"
+ "move $9, %6\r\n"
+ "move $10, %7\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %4\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "mov.s $f6, %5\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "3: \r\n"
+ "li %1, 0\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+ "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdcolor-mmi.c b/media/libjpeg/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
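+
+/* Assuming SCALEBITS == 16, FIX(x) is x * 65536 rounded to the nearest
+ * integer: 0.34414 * 65536 ~= 22554, 0.40200 * 65536 ~= 26345,
+ * 0.28586 * 65536 ~= 18734, 0.22800 * 65536 ~= 14942.  Every constant
+ * therefore fits in the signed 16-bit words that _mm_mulhi_pi16 and
+ * _mm_madd_pi16 operate on.
+ */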
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF 1
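+
+/* With RGBX_FILLER_0XFF defined, the included jdcolext-mmi.c fills the X
+ * byte of 4-byte pixel formats with 0xFF (via _mm_cmpeq_pi8 of a register
+ * with itself, which yields all ones); without it, the X byte would be
+ * zeroed with _mm_xor_si64.
+ */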
+
+
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jdmerge-mmi.c b/media/libjpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF 1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/media/libjpeg/simd/mips64/jdmrgext-mmi.c b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+ __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+ __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+ __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+ __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+ __m64 mm8, mm9;
+#endif
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+ inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ ythis = _mm_load_si64((__m64 *)inptr0);
+ ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ crh = _mm_unpackhi_pi8(cr, zero); /* Cr(4567) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ cbh = _mm_add_pi16(cbh, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+ crh = _mm_add_pi16(crh, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
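+ /* For G, the Cb and Cr words are interleaved and fed to _mm_madd_pi16
+  * with the paired constants (-FIX(0.34414), FIX(0.28586)), producing
+  * Cb*-0.34414 + Cr*0.28586 in each 32-bit lane; subtracting Cr after
+  * scaling gives Cb*-0.34414 + Cr*-0.71414 = (G-Y).
+  */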
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
+
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+ rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bh = _mm_add_pi16(bh, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ bh = _mm_srai_pi16(bh, 1); /* (CbH * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rh = _mm_add_pi16(rh, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+ rh = _mm_srai_pi16(rh, 1); /* (CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bh = _mm_add_pi16(bh, cbh);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+ rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
+
+ ga = _mm_unpacklo_pi16(cbl, crl);
+ gb = _mm_unpackhi_pi16(cbl, crl);
+ ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+ gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+ gc = _mm_unpacklo_pi16(cbh, crh);
+ gd = _mm_unpackhi_pi16(cbh, crh);
+ gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+ gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+ ga = _mm_add_pi32(ga, PD_ONEHALF);
+ gb = _mm_add_pi32(gb, PD_ONEHALF);
+ ga = _mm_srai_pi32(ga, SCALEBITS);
+ gb = _mm_srai_pi32(gb, SCALEBITS);
+ gc = _mm_add_pi32(gc, PD_ONEHALF);
+ gd = _mm_add_pi32(gd, PD_ONEHALF);
+ gc = _mm_srai_pi32(gc, SCALEBITS);
+ gd = _mm_srai_pi32(gd, SCALEBITS);
+
+ gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+ gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+ ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
+ ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
+ ynexte = _mm_and_si64(mask, ynext); /* Y(8ACE) */
+ ynexto = _mm_srli_pi16(ynext, BYTE_BIT); /* Y(9BDF) */
+
+ rle = _mm_add_pi16(rl, ythise); /* (R0 R2 R4 R6) */
+ rlo = _mm_add_pi16(rl, ythiso); /* (R1 R3 R5 R7) */
+ rhe = _mm_add_pi16(rh, ynexte); /* (R8 RA RC RE) */
+ rho = _mm_add_pi16(rh, ynexto); /* (R9 RB RD RF) */
+ re = _mm_packs_pu16(rle, rhe); /* (R0 R2 R4 R6 R8 RA RC RE) */
+ ro = _mm_packs_pu16(rlo, rho); /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+ gle = _mm_add_pi16(gl, ythise); /* (G0 G2 G4 G6) */
+ glo = _mm_add_pi16(gl, ythiso); /* (G1 G3 G5 G7) */
+ ghe = _mm_add_pi16(gh, ynexte); /* (G8 GA GC GE) */
+ gho = _mm_add_pi16(gh, ynexto); /* (G9 GB GD GF) */
+ ge = _mm_packs_pu16(gle, ghe); /* (G0 G2 G4 G6 G8 GA GC GE) */
+ go = _mm_packs_pu16(glo, gho); /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+ ble = _mm_add_pi16(bl, ythise); /* (B0 B2 B4 B6) */
+ blo = _mm_add_pi16(bl, ythiso); /* (B1 B3 B5 B7) */
+ bhe = _mm_add_pi16(bh, ynexte); /* (B8 BA BC BE) */
+ bho = _mm_add_pi16(bh, ynexto); /* (B9 BB BD BF) */
+ be = _mm_packs_pu16(ble, bhe); /* (B0 B2 B4 B6 B8 BA BC BE) */
+ bo = _mm_packs_pu16(blo, bho); /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
+ mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+ mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+ mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
+ mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
+ mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
+ mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
+ mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
+ mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
+
+ mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
+ mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+ mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
+ mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
+ mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
+ mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
+ mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
+
+ mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
+ mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
+ mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+ mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
+ mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmE);
+ _mm_store_si64((__m64 *)(outptr + 32), mmF);
+ _mm_store_si64((__m64 *)(outptr + 40), mmG);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 6 + 3;
+ else
+ col = num_cols * 6;
+
+ asm(".set noreorder\r\n" /* st24 */
+
+ "li $8, 24\r\n"
+ "move $9, %7\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %2\r\n"
+ "mov.s $f8, %3\r\n"
+ "move $10, %8\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "mov.s $f4, %4\r\n"
+ "mov.s $f6, %5\r\n"
+ "mov.s $f8, %6\r\n"
+ "subu $9, $9, 24\r\n"
+ PTR_ADDU "$10, $10, 24\r\n"
+
+ "1: \r\n"
+ "li $8, 16\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "4: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "5: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 6f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "6: \r\n"
+ "nop \r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+ "f" (mmG), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+ mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+ mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
+ mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+ mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+ mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
+ mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
+ mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
+
+ mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+ mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
+ mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
+ mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+ mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
+ mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
+ mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
+
+ mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+ mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+ mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+ mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmD);
+ _mm_store_si64((__m64 *)(outptr + 32), mmE);
+ _mm_store_si64((__m64 *)(outptr + 40), mmF);
+ _mm_store_si64((__m64 *)(outptr + 48), mmG);
+ _mm_store_si64((__m64 *)(outptr + 56), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+ _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 2 + 1;
+ else
+ col = num_cols * 2;
+ asm(".set noreorder\r\n" /* st32 */
+
+ "li $8, 8\r\n"
+ "move $9, %10\r\n"
+ "move $10, %11\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %3\r\n"
+ "mov.s $f8, %4\r\n"
+ "mov.s $f10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "gssdlc1 $f10, 7+24($10)\r\n"
+ "gssdrc1 $f10, 24($10)\r\n"
+ "mov.s $f4, %6\r\n"
+ "mov.s $f6, %7\r\n"
+ "mov.s $f8, %8\r\n"
+ "mov.s $f10, %9\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 32\r\n"
+
+ "1: \r\n"
+ "li $8, 4\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "mov.s $f6, $f10\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "4: \r\n"
+ "li %1, 0\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+ "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+
+ if (!((output_width >> 1) & 7)) {
+ if (output_width & 1) {
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ decenter = 0.0;
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+
+ gl = _mm_unpacklo_pi16(cbl, crl);
+ gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+ gl = _mm_add_pi32(gl, PD_ONEHALF);
+ gl = _mm_srai_pi32(gl, SCALEBITS);
+ gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+ yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
+ rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */
+ gl = _mm_add_pi16(gl, yl); /* (G0 G1 G2 G3) */
+ bl = _mm_add_pi16(bl, yl); /* (B0 B1 B2 B3) */
+ re = _mm_packs_pu16(rl, rl);
+ ge = _mm_packs_pu16(gl, gl);
+ be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set noreorder\r\n"
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "mfc1 $9, $f4\r\n"
+ "ush $9, 0($8)\r\n"
+ "srl $9, 16\r\n"
+ "sb $9, 2($8)\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "$9", "memory"
+ );
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+#else
+ xe = _mm_xor_si64(xe, xe);
+#endif
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set noreorder\r\n"
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "gsswlc1 $f4, 3($8)\r\n"
+ "gsswrc1 $f4, 0($8)\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "memory"
+ );
+#endif
+ }
+ }
+}
+
+
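+/* A row group covers two luma rows that share one chroma row.  The h2v2
+ * routine below reuses the h2v1 kernel: it temporarily aliases the Y row
+ * pointer to each of the two rows in the group (2 * in_row_group_ctr and
+ * 2 * in_row_group_ctr + 1), calls the h2v1 kernel once per output row
+ * with the same Cb/Cr rows, and then restores the saved pointers.
+ */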
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW inptr, outptr;
+
+ inptr = input_buf[0][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+ jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+ output_buf[0] = output_buf[1];
+ jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+
+ input_buf[0][in_row_group_ctr] = inptr;
+ output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdsample-mmi.c b/media/libjpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_TWO,
+ index_PW_THREE,
+ index_PW_SEVEN,
+ index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(2, 2, 2, 2),
+ _uint64_set_pi16(3, 3, 3, 3),
+ _uint64_set_pi16(7, 7, 7, 7),
+ _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_TWO get_const_value(index_PW_TWO)
+#define PW_THREE get_const_value(index_PW_THREE)
+#define PW_SEVEN get_const_value(index_PW_SEVEN)
+#define PW_EIGHT get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+ __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+ __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+ __m64 outle, outhe, outlo, outho, outl, outh; \
+ \
+ samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \
+ sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \
+ samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \
+ sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \
+ \
+ samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \
+ samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \
+ \
+ sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \
+ samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \
+ samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \
+ \
+ samp_1012 = _mm_or_si64(sampX012, wk[row]); /* (-1 0 1 2) */ \
+ samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]); /* ( 5 6 7 8) */ \
+ \
+ wk[row] = samp7XXX; \
+ \
+ samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+ samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+ samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+ samp3456 = _mm_add_pi16(samp3456, bias1); \
+ samp1234 = _mm_add_pi16(samp1234, bias2); \
+ samp5678 = _mm_add_pi16(samp5678, bias2); \
+ \
+ outle = _mm_add_pi16(samp_1012, samp0123); \
+ outhe = _mm_add_pi16(samp3456, samp4567); \
+ outle = _mm_srli_pi16(outle, shift); /* ( 0 2 4 6) */ \
+ outhe = _mm_srli_pi16(outhe, shift); /* ( 8 10 12 14) */ \
+ outlo = _mm_add_pi16(samp1234, samp0123); \
+ outho = _mm_add_pi16(samp5678, samp4567); \
+ outlo = _mm_srli_pi16(outlo, shift); /* ( 1 3 5 7) */ \
+ outho = _mm_srli_pi16(outho, shift); /* ( 9 11 13 15) */ \
+ \
+ outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+ outho = _mm_slli_pi16(outho, BYTE_BIT); \
+ outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \
+ outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \
+ \
+ _mm_store_si64((__m64 *)outptr##row, outl); \
+ _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
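+
+/* PROCESS_ROW applies the "fancy" (triangular-filter) upsampler to one
+ * 8-sample block held in samp0123/samp4567: each input sample s[i] yields
+ * two outputs weighted 3:1 toward the nearer neighbor,
+ *   even: (3*s[i] + s[i-1] + bias1) >> shift
+ *   odd:  (3*s[i] + s[i+1] + bias2) >> shift
+ * with wk[] carrying the boundary samples between blocks.  h2v1 passes
+ * biases 1/2 and shift 2 (total weight 4); h2v2 runs the macro on 3:1
+ * vertical column sums, hence biases 8/7 and shift 4 (total weight 16).
+ */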
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol, tmp, tmp1;
+ __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+ __m64 this0l, this0h, this0;
+ __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+ __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+ __m64 next0l, next0h, next0;
+ __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+ inptr_1 = input_data[inrow - 1];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm(PTR_ADDU "$8, %3, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %3, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %7\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+ : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+ : "$8", "$9"
+ );
+ }
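+ /* The asm above copies the last sample of each input row to the element
+  * one past the end (index downsampled_width), so the right-hand neighbor
+  * of the final valid sample is well defined when the width is not a
+  * multiple of 8; any further lanes fall outside the output and are
+  * ignored.  In effect this is a one-sample expand_right_edge in place.
+  */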
+
+ /* process the first column block */
+ this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */
+ this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */
+
+ this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */
+ this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */
+ this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */
+ this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */
+ this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */
+ this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */
+
+ this0l = _mm_mullo_pi16(this0l, PW_THREE);
+ this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+ thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */
+ thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */
+ thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */
+ thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+ _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+ wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */
+ wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+ outptr0 += 16, outptr1 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */
+ next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */
+
+ next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */
+ next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */
+ next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */
+ next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */
+ next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */
+ next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */
+
+ next0l = _mm_mullo_pi16(next0l, PW_THREE);
+ next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+ nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */
+ nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */
+ nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */
+ nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+ _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+ wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 tmp;
+
+ /* process the last column block */
+ tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+ wk[2] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
+ tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+ wk[3] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
+ }
+
+ /* process the upper row */
+ samp0123 = _mm_load_si64((__m64 *)outptr0); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr0 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+ /* process the lower row */
+ samp0123 = _mm_load_si64((__m64 *)outptr1); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr1 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+ }
+ }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, outptr0;
+ int inrow, incol, tmp, tmp1;
+ __m64 thisl, this, nextl, next;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+ inptr0 = input_data[inrow];
+ outptr0 = output_data[inrow];
+
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm(PTR_ADDU "$8, %1, %2\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %1, %3\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr0)
+ : "r" (inptr0), "r" (tmp), "r" (tmp1)
+ : "$8", "$9"
+ );
+ }
+
+ /* process the first column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thisl = _mm_unpacklo_pi8(this, zero); /* row[ 0][0]( 0 1 2 3) */
+ wk[0] = _mm_and_si64(thisl, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ nextl = _mm_unpacklo_pi8(next, zero); /* row[ 0][1]( 0 1 2 3) */
+ wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 thish;
+
+ /* process the last column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thish = _mm_unpackhi_pi8(this, zero); /* row[ 0][0]( 4 5 6 7) */
+ wk[1] = _mm_and_si64(masklast, thish); /* ( - - - 7) */
+ }
+
+ /* process the row */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ samp0123 = _mm_unpacklo_pi8(this, zero); /* ( 0 1 2 3) */
+ samp4567 = _mm_unpackhi_pi8(this, zero); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+ }
+ }
+}
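The two routines above vectorize libjpeg's "fancy" (triangle-filter) upsampling: each output sample is a 3:1 weighted average of the nearest input sample and one neighbor, and the PW_ONE/PW_TWO (h2v1) and PW_EIGHT/PW_SEVEN (h2v2) constants fed to PROCESS_ROW are the rounding biases of that filter. A minimal scalar sketch of the h2v1 case follows, for illustration only; it is not part of the patch, the names are invented, and it replicates edge samples rather than special-casing the first and last columns the way libjpeg's C upsampler does.

#include <stddef.h>

typedef unsigned char sample_t;                  /* stands in for JSAMPLE */

/* h2v1 fancy upsampling, one row: out receives 2 * width samples */
static void h2v1_fancy_upsample_ref(const sample_t *in, sample_t *out,
                                    size_t width)
{
  size_t i;

  for (i = 0; i < width; i++) {
    int cur   = in[i] * 3;                       /* 3:1 weighting */
    int left  = in[i > 0 ? i - 1 : 0];           /* edge replication */
    int right = in[i + 1 < width ? i + 1 : width - 1];

    out[2 * i]     = (sample_t)((cur + left + 1) >> 2);   /* PW_ONE bias */
    out[2 * i + 1] = (sample_t)((cur + right + 2) >> 2);  /* PW_TWO bias */
  }
}

The h2v2 routine applies the same filter in both dimensions: the thiscolsum/nextcolsum stage above computes the vertical 3:1 sums, and PROCESS_ROW then combines them horizontally, which is why its biases are 8 and 7 with a final shift by 4 instead of 2.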
diff --git a/media/libjpeg/simd/mips64/jfdctfst-mmi.c b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8
+
+#define F_0_382 ((short)98) /* FIX(0.382683433) */
+#define F_0_541 ((short)139) /* FIX(0.541196100) */
+#define F_0_707 ((short)181) /* FIX(0.707106781) */
+#define F_1_306 ((short)334) /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+ index_PW_F0707,
+ index_PW_F0382,
+ index_PW_F0541,
+ index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+ _uint64_set1_pi16(F_0_707),
+ _uint64_set1_pi16(F_0_382),
+ _uint64_set1_pi16(F_0_541),
+ _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707 get_const_value(index_PW_F0707)
+#define PW_F0382 get_const_value(index_PW_F0382)
+#define PW_F0541 get_const_value(index_PW_F0541)
+#define PW_F1306 get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+ __m64 mulhi, mullo, mul12, mul34; \
+ \
+ mullo = _mm_mullo_pi16(in, multiplier); \
+ mulhi = _mm_mulhi_pi16(in, multiplier); \
+ mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+ mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+ mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+ mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+ out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); \
+ out4 = _mm_sub_pi16(tmp10, tmp11); \
+ \
+ z1 = _mm_add_pi16(tmp12, tmp13); \
+ DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+ \
+ out2 = _mm_add_pi16(tmp13, z1); \
+ out6 = _mm_sub_pi16(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp4, tmp5); \
+ tmp11 = _mm_add_pi16(tmp5, tmp6); \
+ tmp12 = _mm_add_pi16(tmp6, tmp7); \
+ \
+ z5 = _mm_sub_pi16(tmp10, tmp12); \
+ DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+ \
+ DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+ z2 = _mm_add_pi16(z2, z5); \
+ \
+ DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+ z4 = _mm_add_pi16(z4, z5); \
+ \
+ DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+ \
+ z11 = _mm_add_pi16(tmp7, z3); \
+ z13 = _mm_sub_pi16(tmp7, z3); \
+ \
+ out5 = _mm_add_pi16(z13, z2); \
+ out3 = _mm_sub_pi16(z13, z2); \
+ out1 = _mm_add_pi16(z11, z4); \
+ out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ DO_FDCT_COMMON() \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ DO_FDCT_COMMON() \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+ DCTELEM *dataptr = data;
+
+ /* Pass 1: process rows. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4;
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4;
+ DO_FDCT_PASS2()
+}
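DO_FDCT_MULTIPLY above is the fast DCT's 8-bit fixed-point multiply: the constants are the real-valued factors scaled by 2^CONST_BITS, and the mullo/mulhi pair reconstructs the full 32-bit product before shifting it back down and repacking. A scalar sketch of one 16-bit lane, for illustration only (names are invented, and the plain cast does not model the saturation that _mm_packs_pi32 performs):

#include <stdint.h>

#define REF_CONST_BITS 8   /* matches CONST_BITS above */

/* Fixed-point v * x, where fix_const == round(x * 256);
 * e.g. F_0_707 == 181 encodes 0.707106781. */
static int16_t fdct_multiply_ref(int16_t v, int16_t fix_const)
{
  int32_t product = (int32_t)v * fix_const;      /* mullo + mulhi recombined */
  return (int16_t)(product >> REF_CONST_BITS);   /* srai, then pack */
}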
diff --git a/media/libjpeg/simd/mips64/jfdctint-mmi.c b/media/libjpeg/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)),
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541),
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)),
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175),
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899),
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562),
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+ _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+ (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+ __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+ __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * out2 = z1 + tmp13 * 0.765366865; \
+ * out6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+ tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+ \
+ out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+ out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+ out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+ out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = _mm_add_pi16(tmp4, tmp6); \
+ z4 = _mm_add_pi16(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \
+ * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * out7 = tmp4 + z3; out5 = tmp5 + z4; \
+ * out3 = tmp6 + z3; out1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+ tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+ \
+ tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+ tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+ tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+ tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+ \
+ out7l = _mm_add_pi32(tmp4l, z3l); \
+ out7h = _mm_add_pi32(tmp4h, z3h); \
+ out1l = _mm_add_pi32(tmp7l, z4l); \
+ out1h = _mm_add_pi32(tmp7h, z4h); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ \
+ tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+ tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+ \
+ tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+ tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+ tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+ tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+ \
+ out5l = _mm_add_pi32(tmp5l, z4l); \
+ out5h = _mm_add_pi32(tmp5h, z4h); \
+ out3l = _mm_add_pi32(tmp6l, z3l); \
+ out3h = _mm_add_pi32(tmp6h, z3h); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ __m64 tmp10, tmp11; \
+ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+ out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(1) \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ __m64 tmp10, tmp11; \
+ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ \
+ out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+ out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(2) \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp12, tmp13;
+ DCTELEM *dataptr = data;
+
+ /* Pass 1: process rows. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4;
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4;
+ DO_FDCT_PASS2()
+}
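The unpack-then-_mm_madd_pi16 pattern that dominates DO_FDCT_COMMON above is the usual pmaddwd rotation trick: interleaving two 16-bit inputs and multiplying by a packed constant pair yields a*c0 + b*c1 directly in a 32-bit lane, so each rotated output costs one multiply-add plus a rounding add and shift. A scalar sketch of one lane, for illustration only (names are invented; the SIMD pack saturates where the plain cast here assumes the result fits):

#include <stdint.h>

/* One lane of the madd-based rotation: with c0 = FIX(0.541196100 +
 * 0.765366865) and c1 = FIX(0.541196100) this is the "out2" term from
 * the comment in DO_FDCT_COMMON; the 1 << (descale - 1) addend is what
 * PD_DESCALE_P1/PD_DESCALE_P2 supply. */
static int16_t madd_rotate_ref(int16_t a, int16_t b, int16_t c0, int16_t c1,
                               int descale)
{
  int32_t acc = (int32_t)a * c0 + (int32_t)b * c1;            /* pmaddwd */
  return (int16_t)((acc + (1 << (descale - 1))) >> descale);  /* round, shift */
}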
diff --git a/media/libjpeg/simd/mips64/jidctfst-mmi.c b/media/libjpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+
+#define FIX_1_082 ((short)277) /* FIX(1.082392200) */
+#define FIX_1_414 ((short)362) /* FIX(1.414213562) */
+#define FIX_1_847 ((short)473) /* FIX(1.847759065) */
+#define FIX_2_613 ((short)669) /* FIX(2.613125930) */
+#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(3) */
+
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+ index_PW_F1082,
+ index_PW_F1414,
+ index_PW_F1847,
+ index_PW_MF1613,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+ _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+ _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+ _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+ _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+ _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414 get_const_value(index_PW_F1414)
+#define PW_F1847 get_const_value(index_PW_F1847)
+#define PW_MF1613 get_const_value(index_PW_MF1613)
+#define PW_F1082 get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+ tmp7 = _mm_add_pi16(z11, z13); \
+ \
+ tmp11 = _mm_sub_pi16(z11, z13); \
+ tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+ tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+ \
+ tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = ((3 - 2.613125930) - 3) * z10 + z5; \
+ * = (3 - 2.613125930) * z10 - z10 - z10 - z10 + z5; \
+ */ \
+ \
+ z5 = _mm_add_pi16(tmp10, tmp12); \
+ z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+ \
+ tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+ tmp10 = _mm_sub_pi16(tmp10, z5); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_add_pi16(tmp12, z5); \
+ \
+ /* Final output stage */ \
+ \
+ tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+ tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+ tmp4 = _mm_add_pi16(tmp10, tmp5); \
+ \
+ out0 = _mm_add_pi16(tmp0, tmp7); \
+ out7 = _mm_sub_pi16(tmp0, tmp7); \
+ out1 = _mm_add_pi16(tmp1, tmp6); \
+ out6 = _mm_sub_pi16(tmp1, tmp6); \
+ \
+ out2 = _mm_add_pi16(tmp2, tmp5); \
+ out5 = _mm_sub_pi16(tmp2, tmp5); \
+ out4 = _mm_add_pi16(tmp3, tmp4); \
+ out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+ tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+ tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+ tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp2); \
+ tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+ tmp13 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+ tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+ \
+ z13 = _mm_add_pi16(tmp6, tmp5); \
+ z10 = _mm_sub_pi16(tmp6, tmp5); \
+ z11 = _mm_add_pi16(tmp4, tmp7); \
+ z12 = _mm_sub_pi16(tmp4, tmp7); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h; \
+ __m64 col0, col1, col2, col3; \
+ __m64 row06, row17, row24, row35; \
+ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(row0l, row4l); \
+ tmp11 = _mm_sub_pi16(row0l, row4l); \
+ tmp13 = _mm_add_pi16(row2l, row6l); \
+ \
+ tmp12 = _mm_sub_pi16(row2l, row6l); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = _mm_add_pi16(row5l, row3l); \
+ z10 = _mm_sub_pi16(row5l, row3l); \
+ z11 = _mm_add_pi16(row1l, row7l); \
+ z12 = _mm_sub_pi16(row1l, row7l); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+ out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+ out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+ out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+ out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+ out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+ out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 tmp10, tmp11, tmp12, tmp13;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 z5, z10, z11, z12, z13;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+
+ DO_IDCT_PASS1(1)
+nextcolumn1:
+ inptr += 4;
+ quantptr += 4;
+ wsptr += DCTSIZE * 4;
+ DO_IDCT_PASS1(2)
+nextcolumn2:
+
+ /* Pass 2: process rows. */
+
+ wsptr = workspace;
+
+ DO_IDCT_PASS2(0)
+ wsptr += 4;
+ DO_IDCT_PASS2(4)
+}
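Two details of this file are worth spelling out. The DC-only shortcut in DO_IDCT_PASS1 mirrors libjpeg's scalar fast path: when every AC coefficient in the 4-column group is zero, the IDCT degenerates to replicating the dequantized DC term, so the code broadcasts it and skips the arithmetic entirely. And the tmp12 sequence in DO_IDCT_COMMON is the overflow workaround its comment describes: 2.613125930 is too large a factor for the pre-multiply/mulhi scheme, so the code multiplies by 3 - 2.613125930 = 0.386874070 (PW_MF1613 encodes -(FIX_2_613 - 256 * 3)) and then subtracts z10 three times exactly. A scalar sketch, for illustration only (names are invented, and the >> 8 approximates the net pre-multiply/mulhi scaling):

#include <stdint.h>

/* tmp12 = -2.613125930 * z10 + z5, without a multiplier that overflows:
 * 99/256 ~= 3 - 2.613125930, and the -3 * z10 part is exact. */
static int16_t idct_tmp12_ref(int16_t z10, int16_t z5)
{
  int32_t small = ((int32_t)z10 * 99) >> 8;   /* mulhi by PW_MF1613 */
  return (int16_t)(small - 3 * z10 + z5);     /* three exact subtractions */
}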
diff --git a/media/libjpeg/simd/mips64/jidctint-mmi.c b/media/libjpeg/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE 128
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)),
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541),
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)),
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175),
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899),
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562),
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+ _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+ CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
+};
+
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON(PASS) { \
+ __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+ \
+ z3 = _mm_add_pi16(tmp0, tmp2); \
+ z4 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ \
+ tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+ tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+ \
+ tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+ tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+ tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+ tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+ \
+ tmp0l = _mm_add_pi32(tmp0l, z3l); \
+ tmp0h = _mm_add_pi32(tmp0h, z3h); \
+ tmp3l = _mm_add_pi32(tmp3l, z4l); \
+ tmp3h = _mm_add_pi32(tmp3h, z4h); \
+ \
+ tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+ tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+ \
+ tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+ tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+ tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+ tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+ \
+ tmp1l = _mm_add_pi32(tmp1l, z4l); \
+ tmp1h = _mm_add_pi32(tmp1h, z4h); \
+ tmp2l = _mm_add_pi32(tmp2l, z3l); \
+ tmp2h = _mm_add_pi32(tmp2h, z3h); \
+ \
+ /* Final output stage */ \
+ \
+ out0l = _mm_add_pi32(tmp10l, tmp3l); \
+ out0h = _mm_add_pi32(tmp10h, tmp3h); \
+ out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+ out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+ \
+ out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+ out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+ out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+ out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out0 = _mm_packs_pi32(out0l, out0h); \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ \
+ out1l = _mm_add_pi32(tmp11l, tmp2l); \
+ out1h = _mm_add_pi32(tmp11h, tmp2h); \
+ out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+ out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ out2l = _mm_add_pi32(tmp12l, tmp1l); \
+ out2h = _mm_add_pi32(tmp12h, tmp1h); \
+ out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+ out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ \
+ out3l = _mm_add_pi32(tmp13l, tmp0l); \
+ out3h = _mm_add_pi32(tmp13h, tmp0h); \
+ \
+ out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+ out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+ out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+ out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+ out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+ \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+ out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 z23, z2, z3, z23l, z23h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); \
+ dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ z2 = _mm_mullo_pi16(col2l, quant2l); \
+ z3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ z23l = _mm_unpacklo_pi16(z2, z3); \
+ z23h = _mm_unpackhi_pi16(z2, z3); \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ \
+ z2 = _mm_mullo_pi16(col0l, quant0l); \
+ z3 = _mm_mullo_pi16(col4l, quant4l); \
+ \
+ z23 = _mm_add_pi16(z2, z3); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ \
+ z23 = _mm_sub_pi16(z2, z3); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+ tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+ \
+ DO_IDCT_COMMON(1) \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
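+
+/* Each DO_IDCT_PASS1 invocation dequantizes and transforms four columns of
+ * the 8x8 block (one __m64 lane per column), takes an early out that stores
+ * constant rows when all AC terms in those columns are zero, and leaves the
+ * descaled intermediate results in the workspace for pass 2. */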
+
+#define DO_IDCT_PASS2(ctr) { \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 z23, z23l, z23h; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+ __m64 col0, col1, col2, col3; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ z23l = _mm_unpacklo_pi16(row2l, row6l); \
+ z23h = _mm_unpackhi_pi16(row2l, row6l); \
+ \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ \
+ z23 = _mm_add_pi16(row0l, row4l); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ \
+ z23 = _mm_sub_pi16(row0l, row4l); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ tmp0 = row7l; \
+ tmp1 = row5l; \
+ tmp2 = row3l; \
+ tmp3 = row1l; \
+ \
+ DO_IDCT_COMMON(2) \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
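+
+/* Each DO_IDCT_PASS2 invocation runs the second 1-D IDCT over the workspace
+ * data, packs the results to bytes, re-centers them with PB_CENTERJSAMP,
+ * transposes, and stores four finished 8-sample rows to
+ * output_buf[ctr + 0..3] + output_col. */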
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+
+ DO_IDCT_PASS1(1)
+nextcolumn1:
+ inptr += 4;
+ quantptr += 4;
+ wsptr += DCTSIZE * 4;
+ DO_IDCT_PASS1(2)
+nextcolumn2:
+
+ /* Pass 2: process rows. */
+
+ wsptr = workspace;
+
+ DO_IDCT_PASS2(0)
+ wsptr += 4;
+ DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jquanti-mmi.c b/media/libjpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+ __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+ __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+ \
+ rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+ rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+ \
+ /* Branch-less absolute value */ \
+ rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \
+ /* 0 otherwise */ \
+ rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ \
+ corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
+ corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+ \
+ rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \
+ rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+ \
+ recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
+ reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, recipl); \
+ rowh = _mm_mulhi_pi16(rowh, reciph); \
+ \
+ /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+ /* initial value (input value is never negative as we inverted it at the */ \
+ /* start of this routine) */ \
+ rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
+ scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, scalel); \
+ rowh = _mm_mulhi_pi16(rowh, scaleh); \
+ \
+ /* determine if scale is negative */ \
+ scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+ scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+ \
+ /* and add input if it is */ \
+ scalel = _mm_and_si64(scalel, rowlsave); \
+ scaleh = _mm_and_si64(scaleh, rowhsave); \
+ rowl = _mm_add_pi16(rowl, scalel); \
+ rowh = _mm_add_pi16(rowh, scaleh); \
+ \
+ /* then check if negative input */ \
+ rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+ rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+ \
+ /* and add scale if it is */ \
+ rowlsave = _mm_and_si64(rowlsave, scalel); \
+ rowhsave = _mm_and_si64(rowhsave, scaleh); \
+ rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ \
+ _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+ _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+ \
+ workspace += DCTSIZE; \
+ divisors += DCTSIZE; \
+ output_ptr += DCTSIZE; \
+}
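+
+/* Roughly, each DO_QUANT step computes, per 16-bit coefficient,
+ *
+ *   y = ((abs(x) + correction) * reciprocal >> 16) * scale >> 16
+ *
+ * and restores the original sign at the end; the conditional additions
+ * above compensate for pmulhh treating both of its operands as signed.
+ * The reciprocal/correction/scale planes are assumed to use the layout
+ * that the library's compute_reciprocal() emits (DCTSIZE2 words apiece).
+ */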
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR output_ptr = coef_block;
+
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+}
diff --git a/media/libjpeg/simd/mips64/jsimd.c b/media/libjpeg/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..e8f1af562b
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd.c
@@ -0,0 +1,870 @@
+/*
+ * jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "ASEs implemented", 16) != 0)
+ return 0;
+ buffer += 16;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
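+
+/* Example of the /proc/cpuinfo line this matches (format assumed from
+ * typical MIPS kernels):
+ *
+ *   ASEs implemented : vz msa loongson-mmi loongson-ext
+ */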
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "loongson-mmi"))
+ simd_support |= JSIMD_MMI;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
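+
+/* Returns 1 once the whole file has been scanned, and 0 if a line did not
+ * fit in the buffer, in which case init_simd() retries with a doubled
+ * buffer size. */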
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if defined(__linux__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__linux__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__mips_loongson_vector_rev)
+ /* Only enable MMI by default on non-Linux platforms when the compiler flags
+ * support it. */
+ simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEMMI");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_MMI;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
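+
+/* For testing, the detection above can be overridden from the shell, e.g.:
+ *   JSIMD_FORCEMMI=1   force the MMI code paths on
+ *   JSIMD_FORCENONE=1  disable SIMD acceleration entirely
+ */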
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_ycc_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_ycc_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_rgb_ycc_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
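+
+/* The same dispatch-by-colorspace pattern recurs for the gray and
+ * YCC-to-RGB converters below; only the underlying *_mmi kernel and the
+ * conversion direction differ. */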
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_gray_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_gray_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_gray_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_rgb_gray_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_ycc_extrgb_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_ycc_extrgbx_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_ycc_extbgr_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_ycc_extbgrx_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_ycc_extxbgr_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_ycc_extxrgb_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_ycc_rgb_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+ break;
+ default:
+ mmifct = jsimd_h2v2_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+ break;
+ default:
+ mmifct = jsimd_h2v1_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/mips64/jsimd_mmi.h b/media/libjpeg/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * QingfaLiu <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU "daddu "
+# define PTR_SLL "dsll "
+#else
+# define PTR_ADDU "addu "
+# define PTR_SLL "sll "
+#endif
+
+#define SIZEOF_MMWORD 8
+#define BYTE_BIT 8
+#define WORD_BIT 16
+#define SCALEBITS 16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+ (((uint64_t)(uint8_t)a << 56) | \
+ ((uint64_t)(uint8_t)b << 48) | \
+ ((uint64_t)(uint8_t)c << 40) | \
+ ((uint64_t)(uint8_t)d << 32) | \
+ ((uint64_t)(uint8_t)e << 24) | \
+ ((uint64_t)(uint8_t)f << 16) | \
+ ((uint64_t)(uint8_t)g << 8) | \
+ ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a) _uint64_set_pi8(a, a, a, a, a, a, a, a)
+#define _uint64_set_pi16(a, b, c, d) \
+ (((uint64_t)(uint16_t)a << 48) | \
+ ((uint64_t)(uint16_t)b << 32) | \
+ ((uint64_t)(uint16_t)c << 16) | \
+ ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a) _uint64_set_pi16(a, a, a, a)
+#define _uint64_set_pi32(a, b) \
+ (((uint64_t)(uint32_t)a << 32) | \
+ ((uint64_t)(uint32_t)b))
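+
+/* Example: _uint64_set_pi16(4, 3, 2, 1) yields 0x0004000300020001, i.e.
+ * the last argument lands in the least-significant word, mirroring the
+ * element order of the pi16 intrinsics in loongson-mmintrin.h. */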
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
diff --git a/media/libjpeg/simd/mips64/loongson-mmintrin.h b/media/libjpeg/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/media/libjpeg/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+ load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+ return 0.0;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+ uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+ __m64 ret;
+ uint32_t lo = ((uint32_t)__b6 << 24) |
+ ((uint32_t)__b4 << 16) |
+ ((uint32_t)__b2 << 8) |
+ (uint32_t)__b0;
+ uint32_t hi = ((uint32_t)__b7 << 24) |
+ ((uint32_t)__b5 << 16) |
+ ((uint32_t)__b3 << 8) |
+ (uint32_t)__b1;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 %2, $f0\n\t"
+ "punpcklbh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (lo), "r" (hi)
+ : "$f0"
+ );
+
+ return ret;
+}
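+
+/* The two mtc1 moves put the even-indexed bytes in one register and the
+ * odd-indexed bytes in another; punpcklbh then interleaves them back into
+ * byte order __b0..__b7. */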
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+ __m64 ret;
+ uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+ uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 %2, $f0\n\t"
+ "punpcklhw %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (lo), "r" (hi)
+ : "$f0"
+ );
+
+ return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+ if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+ uint64_t val = ((uint64_t)__i1 << 32) |
+ ((uint64_t)__i0 << 0);
+
+ return *(__m64 *)&val;
+ } else if (__i1 == __i0) {
+ uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+ __m64 ret;
+
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+ );
+
+ return ret;
+ } else {
+ uint64_t val = ((uint64_t)__i1 << 32) |
+ ((uint64_t)__i0 << 0);
+
+ return *(__m64 *)&val;
+ }
+}
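+
+/* When the two halves are equal but not compile-time constants, a single
+ * pshufh broadcast of the 32-bit value avoids assembling the 64-bit
+ * immediate in memory first. */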
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+ __m64 ret;
+
+ asm("sll $8, %1, 8\n\t"
+ "or %1, %1, $8\n\t"
+ "mtc1 %1, %0\n\t"
+ "mtc1 $0, $f0\n\t"
+ "pshufh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (__b0)
+ : "$8", "$f0"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+ __m64 ret;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 $0, $f0\n\t"
+ "pshufh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (__h0)
+ : "$8", "$f0"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+ return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+ uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+ return _mm_set_pi8(__h7, __h6, __h5, __h4,
+ __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+ return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+ return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddsb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddusb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddush %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pavgb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pavgh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaddhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaxsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaxub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pminsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pminub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+ int ret;
+
+ asm("pmovmskb %0, %1\n\t"
+ : "=r" (ret)
+ : "y" (__m1)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmulhh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmulhuh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmullh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmuluw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psadbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pasubub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("biadd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubsb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubusb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubush %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("and %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("andn %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
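+
+/* Assumed to follow the x86 intrinsic of the same name: the Loongson
+ * `andn` computes (~__m1) & __m2, complementing its first source operand. */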
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+ __m32 ret;
+
+ asm("or %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("or %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("xor %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psllh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psllw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsll %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrlh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrlw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsrl %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrah %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psraw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsra %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+ return *(__m64 *)&x;
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+ return *(uint64_t *)&x;
+}
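+
+/* These helpers type-pun between the floating-point-register __m64 view and
+ * a plain uint64_t through pointer casts; strictly this bends C's aliasing
+ * rules, so builds are assumed to tolerate it (e.g. via GCC's
+ * -fno-strict-aliasing). */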
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgtb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgtw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpltb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmplth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpltw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsshb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsswh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsswh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packushb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+ __m64 ret;
+
+ asm("pextrh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__pos)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+ __m64 ret;
+
+ switch (__pos) {
+ case 0:
+
+ asm("pinsrh_0 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+
+ case 1:
+
+ asm("pinsrh_1 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+ case 2:
+
+ asm("pinsrh_2 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+
+ case 3:
+
+ asm("pinsrh_3 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+ }
+
+ return ret;
+}
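+
+/* __pos must be a compile-time constant in 0..3: the insertion position is
+ * encoded in the pinsrh_N opcode itself, hence one case per instruction.
+ * For any other value, ret is returned uninitialized. */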
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+ __m64 ret;
+
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__n)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+/* Since punpcklbh cares about the high 32 bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+ src = _mm_packs_pu16(src, _mm_setzero_si64());
+
+ asm("swc1 %1, %0\n\t"
+ : "=m" (*dest)
+ : "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+ asm("sdc1 %1, %0 \n\t"
+ : "=m" (*dest)
+ : "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+ asm("gssdlc1 %1, 7(%0) \n\t"
+ "gssdrc1 %1, 0(%0) \n\t"
+ :
+ : "r" (dest), "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+ __m32 ret;
+
+ asm("lwc1 %0, %1\n\t"
+ : "=f" (ret)
+ : "m" (*src)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+ __m64 ret;
+
+ asm("ldc1 %0, %1\n\t"
+ : "=f" (ret)
+ : "m" (*src)
+ : "memory"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+ __m64 ret;
+
+ asm("gsldlc1 %0, 7(%1)\n\t"
+ "gsldrc1 %0, 0(%1)\n\t"
+ : "=f" (ret)
+ : "r" (src)
+ : "memory"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+ return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+ return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+ return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+ return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+ return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+ return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+ return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+ return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+ return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif /* __LOONGSON_MMINTRIN_H__ */
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%define ymmA ymm0
+%define ymmB ymm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%define ymmA ymm2
+%define ymmB ymm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%define ymmA ymm4
+%define ymmB ymm5
+%else
+%define mmA mm6
+%define mmB mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%define ymmA ymm6
+%define ymmB ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%define ymmC ymm0
+%define ymmD ymm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%define ymmC ymm2
+%define ymmD ymm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%define ymmC ymm4
+%define ymmD ymm5
+%else
+%define mmC mm6
+%define mmD mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%define ymmC ymm6
+%define ymmD ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%define ymmE ymm0
+%define ymmF ymm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%define ymmE ymm2
+%define ymmF ymm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%define ymmE ymm4
+%define ymmF ymm5
+%else
+%define mmE mm6
+%define mmF mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%define ymmE ymm6
+%define ymmF ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%define ymmG ymm0
+%define ymmH ymm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%define ymmG ymm2
+%define ymmH ymm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%define ymmG ymm4
+%define ymmH ymm5
+%else
+%define mmG mm6
+%define mmH mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%define ymmG ymm6
+%define ymmH ymm7
+%endif
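+
+; Worked example (illustrative, not upstream text).  With the classic
+; ordering RGB_RED = 0, RGB_GREEN = 1, RGB_BLUE = 2 (RGB_PIXELSIZE = 3),
+; the aliases above resolve to:
+;
+;   mmA/mmB (xmmA/xmmB, ymmA/ymmB) = mm0/mm1  ; component 0: red
+;   mmC/mmD (xmmC/xmmD, ymmC/ymmD) = mm2/mm3  ; component 1: green
+;   mmE/mmF (xmmE/xmmF, ymmE/ymmF) = mm4/mm5  ; component 2: blue
+;   mmG/mmH (xmmG/xmmH, ymmG/ymmH) = mm6/mm7  ; no component 3 (%else case)
+;
+; The conversion code is written once against mmA..mmH, and each
+; RGB/BGR/RGBX/XBGR/... variant gets the right register assignment for free.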
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
index b9761071e9..9192f66f0c 100644
--- a/media/libjpeg/simd/jdct.inc
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -2,12 +2,11 @@
; jdct.inc - private declarations for forward & reverse DCT subsystems
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
; Each IDCT routine is responsible for range-limiting its results and
; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
@@ -17,11 +16,16 @@
;
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
-%define ROW(n,b,s) ((b)+(n)*(s))
-%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
+%define ROW(n, b, s) ((b) + (n) * (s))
+%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE)
-%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
index 9d4aedec9e..667024a5f9 100755..100644
--- a/media/libjpeg/simd/jsimdcfg.inc
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -90,5 +90,4 @@
%define JSIMD_3DNOW 0x02
%define JSIMD_SSE 0x04
%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
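+
+/* Assumed sketch of the generation step (not part of this header): the C
+ * preprocessor leaves the single-token left-hand sides alone but expands
+ * the right-hand sides, and the build then strips the protective prefix,
+ * roughly:
+ *
+ *   cpp -DJPEG_INTERNALS jsimdcfg.inc.h | sed 's/_cpp_protection_//g' \
+ *       > jsimdcfg.inc
+ *
+ * so "%define _cpp_protection_DCTSIZE DCTSIZE" comes out as
+ * "%define DCTSIZE 8".  The exact command line is build-system specific;
+ * this is only an illustration.
+ */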
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF 1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, the DCTELEM type is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, the MULTIPLIER type is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+; System-dependent configurations
+
+%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use32 class=CODE
+%define SEG_CONST .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use64 class=CODE
+%define SEG_CONST .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT _text align=32 public use32 class=CODE
+%define SEG_CONST _data align=32 public use32 class=DATA
+
+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT .text progbits align=32
+%define SEG_CONST .rodata progbits align=32
+%else
+%define SEG_TEXT .text progbits alloc exec nowrite align=32
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
+%define SEG_CONST .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
+
+%else ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+%endif ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+; Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resq
+%define dp dq
+%define raxp rax
+%define rbxp rbx
+%define rcxp rcx
+%define rdxp rdx
+%define rsip rsi
+%define rdip rdi
+%define rbpp rbp
+%define rspp rsp
+%define r8p r8
+%define r9p r9
+%define r10p r10
+%define r11p r11
+%define r12p r12
+%define r13p r13
+%define r14p r14
+%define r15p r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resd
+%define dp dd
+; x86_64 ILP32 ABI (x32)
+%define raxp eax
+%define rbxp ebx
+%define rcxp ecx
+%define rdxp edx
+%define rsip esi
+%define rdip edi
+%define rbpp ebp
+%define rspp esp
+%define r8p r8d
+%define r9p r9d
+%define r10p r10d
+%define r11p r11d
+%define r12p r12d
+%define r13p r13d
+%define r14p r14d
+%define r15p r15d
+%endif
+
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD ; int256 (AVX register)
+%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
+%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
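+
+; Illustration (not upstream text): with the blank defines above, a source
+; line such as
+;   movdqa  XMMWORD [rsp], xmm6
+; assembles as "movdqa [rsp], xmm6" -- the size annotation survives as
+; documentation while NASM infers the operand width from the register.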
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE 1 ; sizeof(byte)
+%define SIZEOF_WORD 2 ; sizeof(word)
+%define SIZEOF_DWORD 4 ; sizeof(dword)
+%define SIZEOF_QWORD 8 ; sizeof(qword)
+%define SIZEOF_OWORD 16 ; sizeof(oword)
+%define SIZEOF_YWORD 32 ; sizeof(yword)
+
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+; External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name) _ %+ name ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+; Hidden symbols
+;
+%ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
+%define GLOBAL_DATA(name) global EXTN(name):data hidden
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name) global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name) global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+; Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+ SECTION SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym) (got) + (sym) - const_base
+
+%imacro get_GOT 1
+  ; NOTE: this macro destroys the ecx register.
+ call %%geteip
+ add ecx, byte (%%ref - $)
+ jmp short %%adjust
+%%geteip:
+ mov ecx, POINTER [esp]
+ ret
+%%adjust:
+ push ebp
+ xor ebp, ebp ; ebp = 0
+%ifidni %1, ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D, 0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+%else ; (%1 != ebx)
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D, 0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+ mov %1, ecx
+%endif ; (%1 == ebx)
+ pop ebp
+%endmacro
+
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
+%%geteip:
+ mov %1, POINTER [esp]
+ ret
+%%done:
+%endmacro
+
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+ push %1
+%endmacro
+%imacro poppic 1.nolist
+ pop %1
+%endmacro
+%imacro movpic 2.nolist
+ mov %1, %2
+%endmacro
+
+%else ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym) (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic 1.nolist
+%endmacro
+%imacro movpic 2.nolist
+%endmacro
+
+%endif ; PIC -----------------------------------------
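+
+; Hypothetical usage sketch (not upstream text).  PIC-safe code loads the
+; GOT address once and then addresses constant tables relative to it:
+;
+;   get_GOT ebx                              ; clobbers ecx on Mach-O
+;   movdqa  xmm7, XMMWORD [GOTOFF(ebx, jconst_example)]
+;
+; where jconst_example stands for some constant-pool symbol.  When PIC is
+; not defined, get_GOT expands to nothing and GOTOFF(got, sym) collapses
+; to (sym), so the same source assembles to absolute addressing.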
+
+; --------------------------------------------------------------------------
+; Align the next instruction on {2,4,8,16,..}-byte boundary.
+; ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n) (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+ db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+ db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+ db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+ db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+ db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+ db 0x8B, 0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+ db 0x90 ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+ align %1, db 0 ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm6
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm7
+ mov r10, rcx
+%if %1 > 1
+ mov r11, rdx
+%endif
+%if %1 > 2
+ push r12
+ mov r12, r8
+%endif
+%if %1 > 3
+ push r13
+ mov r13, r9
+%endif
+%if %1 > 4
+ push r14
+ mov r14, [rax+48]
+%endif
+%if %1 > 5
+ push r15
+ mov r15, [rax+56]
+%endif
+ push rsi
+ push rdi
+%endmacro
+
+%imacro uncollect_args 1
+ pop rdi
+ pop rsi
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+ movaps xmm7, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+ sub rsp, %1 * SIZEOF_XMMWORD
+ movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+ movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+ movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+ movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+ movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+ movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+ movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+ movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+ add rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+ push r10
+ mov r10, rdi
+%if %1 > 1
+ push r11
+ mov r11, rsi
+%endif
+%if %1 > 2
+ push r12
+ mov r12, rdx
+%endif
+%if %1 > 3
+ push r13
+ mov r13, rcx
+%endif
+%if %1 > 4
+ push r14
+ mov r14, r8
+%endif
+%if %1 > 5
+ push r15
+ mov r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+%if %1 > 1
+ pop r11
+%endif
+ pop r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
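+
+; Hypothetical entry-point skeleton (illustration, not upstream text):
+;
+;   GLOBAL_FUNCTION(jsimd_example)
+;   EXTN(jsimd_example):
+;       collect_args 4      ; four C arguments now live in r10..r13
+;       ...                 ; body uses r10p..r13p as pointer-width views
+;       uncollect_args 4
+;       ret
+;
+; The macro pair hides the Win64 (rcx, rdx, r8, r9, then stack) versus
+; System V (rdi, rsi, rdx, rcx, r8, r9) calling conventions, so each SIMD
+; routine is written once against r10..r15.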
+
+; --------------------------------------------------------------------------
+; Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c
index 849825eb06..170f90ff80 100644
--- a/media/libjpeg/simd/jccolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c
@@ -24,9 +24,9 @@
/* This file is included by jccolor-altivec.c */
-void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -35,13 +35,13 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = {0};
+ __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = {0};
+ __vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh, crl, crh, cbl, cbh;
@@ -57,9 +57,11 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329 /* FIX(0.08131) */
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_168 11059 /* FIX(0.16874) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_331 21709 /* FIX(0.33126) */
+#define F_0_418 27439 /* FIX(0.41869) */
+#define F_0_500 32768 /* FIX(0.50000) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
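+
+/* Worked example (illustration, not upstream text).  FIX(x) denotes
+   x * 2^SCALEBITS rounded to the nearest integer, so the JFIF luma
+   equation
+
+     Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+
+   becomes, in 16.16 fixed point,
+
+     Y = (F_0_299 * R + F_0_587 * G + F_0_114 * B + ONE_HALF) >> SCALEBITS
+
+   with ONE_HALF supplying round-to-nearest.  For R = G = B = 255:
+   (19595 * 255 + 38470 * 255 + 7471 * 255 + 32768) >> 16 = 255. */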
+
+
+#define RGBG_INDEX0 \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
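+
+/* Illustration (not upstream text): each RGBG_INDEX vector is a vec_perm
+   control that gathers four pixels' worth of bytes from two adjacent
+   16-byte loads (indices 16-31 select from the second input), with (R,G)
+   byte pairs in the upper half and (B,G) pairs in the lower half -- e.g.
+   RGBG_INDEX0 yields R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 for
+   3-byte RGB data, so the kernel can use 16-bit multiply-sum operations
+   on channel pairs. */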
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
index 7f8232bb24..b280cbbded 100644
--- a/media/libjpeg/simd/jcgryext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
@@ -24,10 +24,9 @@
/* This file is included by jcgray-altivec.c */
-void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
- JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr;
int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -36,13 +35,13 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
#endif
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = {0};
+ __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = {0};
+ __vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh;
@@ -54,9 +53,11 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c
index 11609d9dab..6e25b8db90 100644
--- a/media/libjpeg/simd/jcsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c
@@ -26,14 +26,15 @@
#include "jcsample.h"
-void
-jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
{
int outrow, outcol;
- JDIMENSION output_cols = width_blocks * DCTSIZE;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
JSAMPROW inptr, outptr;
__vector unsigned char this0, next0, out;
@@ -43,7 +44,7 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
__vector unsigned short pw_bias = { __4X2(0, 1) },
pw_one = { __8X(1) };
__vector unsigned char even_odd_index =
- {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
pb_zero = { __16X(0) };
expand_right_edge(input_data, max_v_samp_factor, image_width,
@@ -83,13 +84,13 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
void
-jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow, outcol;
- JDIMENSION output_cols = width_blocks * DCTSIZE;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
JSAMPROW inptr0, inptr1, outptr;
__vector unsigned char this0, next0, this1, next1, out;
@@ -100,7 +101,7 @@ jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
__vector unsigned short pw_bias = { __4X2(1, 2) },
pw_two = { __8X(2) };
__vector unsigned char even_odd_index =
- { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
pb_zero = { __16X(0) };
expand_right_edge(input_data, max_v_samp_factor, image_width,
diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
+{
+ register JSAMPROW ptr;
+ register JSAMPLE pixval;
+ register int count;
+ int row;
+ int numcols = (int)(output_cols - input_cols);
+
+ if (numcols > 0) {
+ for (row = 0; row < num_rows; row++) {
+ ptr = image_data[row] + input_cols;
+ pixval = ptr[-1];
+ for (count = numcols; count > 0; count--)
+ *ptr++ = pixval;
+ }
+ }
+}
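+
+/* Illustration (not upstream text): for input_cols = 13 and
+   output_cols = 16, each row's last real sample is replicated into columns
+   13..15, so the vectorized downsamplers can always consume whole 16-byte
+   groups without special-casing the right edge. */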
diff --git a/media/libjpeg/simd/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
index fb121ce745..68d52bd8a2 100644
--- a/media/libjpeg/simd/jdcolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
@@ -23,9 +23,9 @@
/* This file is included by jdcolor-altivec.c */
-void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
- JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
int pitch = out_width * RGB_PIXELSIZE, num_cols;
@@ -61,9 +61,11 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
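+
+/* For reference (illustration, not upstream text): the constants above
+   encode the JFIF YCbCr-to-RGB equations in 16.16 fixed point:
+
+     R = Y + 1.40200 * (Cr - CENTERJSAMPLE)
+     G = Y - 0.34414 * (Cb - CENTERJSAMPLE) - 0.71414 * (Cr - CENTERJSAMPLE)
+     B = Y + 1.77200 * (Cb - CENTERJSAMPLE)
+
+   F_0_402, F_0_285, and F_0_228 rewrite the multipliers >= 1 as
+   "1 + fraction" or "2 - fraction" forms so that every product fits the
+   range of the 16-bit multiply-sum intrinsics. */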
+
+#define RGB_INDEX0 \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
index 55205bb1f9..40f02c33ea 100644
--- a/media/libjpeg/simd/jdmrgext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
@@ -23,10 +23,10 @@
/* This file is included by jdmerge-altivec.c */
-void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
@@ -63,13 +63,19 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
- even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
- odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ even_index =
+ { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
+ odd_index =
+ { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
- even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
- odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+ even_index =
+ { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
+ odd_index =
+ { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
#endif
inptr0 = input_buf[0][in_row_group_ctr];
@@ -299,10 +305,10 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
}
-void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
JSAMPROW inptr, outptr;
diff --git a/media/libjpeg/simd/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c
index b40ce55c89..04df0cf108 100644
--- a/media/libjpeg/simd/jdsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c
@@ -25,31 +25,36 @@
#include "jsimd_altivec.h"
-void
-jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow, incol;
- __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+ __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
out;
__vector short this0e, this0o, this0l, this0h, last0l, last0h,
next0l, next0h, outle, outhe, outlo, outho;
/* Constants */
__vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
- last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
- last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
- next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
- next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+ last_index_col0 =
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ last_index =
+ { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+ next_index =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
+ next_index_lastcol =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
- merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
- merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
__vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
@@ -121,11 +126,10 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
}
-void
-jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
@@ -136,21 +140,27 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
lastcolsum_1h, lastcolsum1h,
p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
- nextcolsum_1l = {0}, nextcolsum_1h = {0},
- nextcolsum1l = {0}, nextcolsum1h = {0},
+ nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+ nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
tmpl, tmph, outle, outhe, outlo, outho;
/* Constants */
__vector unsigned char pb_zero = { __16X(0) },
- last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
- last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
- next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
- next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+ last_index_col0 =
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ last_index =
+ { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ next_index =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
+ next_index_lastcol =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
- merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
- merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
__vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
@@ -306,11 +316,10 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
/* These are rarely used (mainly just for decompressing YCCK images) */
-void
-jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
@@ -345,11 +354,10 @@ jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
}
-void
-jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr0, outptr1;
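
The pb_three/pw_one/pw_two constants above implement the same triangle filter as the portable h2v1 fancy upsampler in jdsample.c: each input pixel yields two outputs weighted 3:1 toward itself, with the +1/+2 terms alternating the rounding direction. A scalar sketch of the h2v1 case, handling edges by neighbor replication, which is what the last_index_col0/next_index_lastcol permutations do and which reproduces the portable code's edge special cases exactly:

    static void h2v1_fancy_upsample_model(const unsigned char *in,
                                          unsigned char *out, int width)
    {
      int i;
      for (i = 0; i < width; i++) {
        int last = in[i > 0 ? i - 1 : 0];                  /* left edge */
        int next = in[i < width - 1 ? i + 1 : width - 1];  /* right edge */
        out[2 * i]     = (unsigned char)((in[i] * 3 + last + 1) >> 2);
        out[2 * i + 1] = (unsigned char)((in[i] * 3 + next + 2) >> 2);
      }
    }
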
diff --git a/media/libjpeg/simd/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
index 04157f77ea..ad9af81e0c 100644
--- a/media/libjpeg/simd/jfdctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
@@ -32,64 +32,62 @@
#include "jsimd_altivec.h"
-#define F_0_382 98 /* FIX(0.382683433) */
-#define F_0_541 139 /* FIX(0.541196100) */
-#define F_0_707 181 /* FIX(0.707106781) */
-#define F_1_306 334 /* FIX(1.306562965) */
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
-#define CONST_BITS 8
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
-#define DO_FDCT() \
-{ \
- /* Even part */ \
+#define DO_FDCT() { \
+ /* Even part */ \
\
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
\
- out0 = vec_add(tmp10, tmp11); \
- out4 = vec_sub(tmp10, tmp11); \
+ out0 = vec_add(tmp10, tmp11); \
+ out4 = vec_sub(tmp10, tmp11); \
\
- z1 = vec_add(tmp12, tmp13); \
- z1 = vec_sl(z1, pre_multiply_scale_bits); \
- z1 = vec_madds(z1, pw_0707, pw_zero); \
+ z1 = vec_add(tmp12, tmp13); \
+ z1 = vec_sl(z1, pre_multiply_scale_bits); \
+ z1 = vec_madds(z1, pw_0707, pw_zero); \
\
- out2 = vec_add(tmp13, z1); \
- out6 = vec_sub(tmp13, z1); \
+ out2 = vec_add(tmp13, z1); \
+ out6 = vec_sub(tmp13, z1); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- tmp10 = vec_add(tmp4, tmp5); \
- tmp11 = vec_add(tmp5, tmp6); \
- tmp12 = vec_add(tmp6, tmp7); \
+ tmp10 = vec_add(tmp4, tmp5); \
+ tmp11 = vec_add(tmp5, tmp6); \
+ tmp12 = vec_add(tmp6, tmp7); \
\
- tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- z5 = vec_sub(tmp10, tmp12); \
- z5 = vec_madds(z5, pw_0382, pw_zero); \
+ tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ z5 = vec_sub(tmp10, tmp12); \
+ z5 = vec_madds(z5, pw_0382, pw_zero); \
\
- z2 = vec_madds(tmp10, pw_0541, z5); \
- z4 = vec_madds(tmp12, pw_1306, z5); \
+ z2 = vec_madds(tmp10, pw_0541, z5); \
+ z4 = vec_madds(tmp12, pw_1306, z5); \
\
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ z3 = vec_madds(tmp11, pw_0707, pw_zero); \
\
- z11 = vec_add(tmp7, z3); \
- z13 = vec_sub(tmp7, z3); \
+ z11 = vec_add(tmp7, z3); \
+ z13 = vec_sub(tmp7, z3); \
\
- out5 = vec_add(z13, z2); \
- out3 = vec_sub(z13, z2); \
- out1 = vec_add(z11, z4); \
- out7 = vec_sub(z11, z4); \
+ out5 = vec_add(z13, z2); \
+ out3 = vec_sub(z13, z2); \
+ out1 = vec_add(z11, z4); \
+ out7 = vec_sub(z11, z4); \
}
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
{
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
col0, col1, col2, col3, col4, col5, col6, col7,
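
The F_* values above are FIX(x) = round(x * 2^CONST_BITS) constants (F_0_707 = 181 is 0.707106781 * 256 rounded), and CONST_SHIFT exists to line their scale up with vec_madds, which per lane computes ((a * b) >> 15) + c with saturation. The input's pre-shift of 2, the constant's shift of 5, and FIX's 8 fractional bits add up to exactly the 15 bits that vec_madds discards, leaving plain in * x. A minimal scalar model of one multiply (saturation omitted):

    static short fix_mul_model(short in, double x)
    {
      /* CONST_SHIFT == 16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1 == 5 */
      int b = ((int)(x * 256 + 0.5)) << 5;   /* FIX(x) << CONST_SHIFT */
      return (short)((((long long)in << 2) * b) >> 15);
    }
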
diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * data2 = z1 + tmp13 * 0.765366865; \
+ * data6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ \
+ out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+ out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+ out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+ out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \
+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ \
+ out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+ out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+ out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+ out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+ \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ \
+ out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+ out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+ out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+ out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+ \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, pw_descale_p2x); \
+ out0 = vec_sra(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, pw_descale_p2x); \
+ out4 = vec_sra(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants */
+ __vector short
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+ pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) };
+
+ /* Pass 1: process rows */
+
+ row0 = vec_ld(0, data);
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col);
+
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_PASS1();
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_PASS2();
+
+ vec_st(out0, 0, data);
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
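
The comment pairs inside DO_FDCT_COMMON above record an algebraic refactoring: the shared rotation term z1 = (tmp12 + tmp13) * 0.541196100 is folded into per-output coefficients, so every output becomes a two-term dot product. vec_mergeh/vec_mergel interleave tmp13 and tmp12 into adjacent 16-bit lanes, and a single vec_msums then evaluates both terms of one output per 32-bit lane, with the rounding constant pd_descale_p* riding along in the accumulator. A scalar model of one data2 lane (using the F_* constants defined above; vec_msums saturation omitted):

    static int fdct_data2_lane(short tmp13, short tmp12,
                               int pd_descale, int descale_bits)
    {
      /* pw_f130_f054 repeats { F_0_541 + F_0_765, F_0_541 } per lane */
      int acc = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541 + pd_descale;
      return acc >> descale_bits;   /* vec_sra(out2l, descale_p##PASS) */
    }
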
diff --git a/media/libjpeg/simd/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
index ec30c3995b..456c6c6174 100644
--- a/media/libjpeg/simd/jidctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
@@ -32,87 +32,85 @@
#include "jsimd_altivec.h"
-#define F_1_082 277 /* FIX(1.082392200) */
-#define F_1_414 362 /* FIX(1.414213562) */
-#define F_1_847 473 /* FIX(1.847759065) */
-#define F_2_613 669 /* FIX(2.613125930) */
-#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
-#define CONST_BITS 8
-#define PASS1_BITS 2
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
-#define DO_IDCT(in) \
-{ \
- /* Even part */ \
+#define DO_IDCT(in) { \
+ /* Even part */ \
\
- tmp10 = vec_add(in##0, in##4); \
- tmp11 = vec_sub(in##0, in##4); \
- tmp13 = vec_add(in##2, in##6); \
+ tmp10 = vec_add(in##0, in##4); \
+ tmp11 = vec_sub(in##0, in##4); \
+ tmp13 = vec_add(in##2, in##6); \
\
- tmp12 = vec_sub(in##2, in##6); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
- tmp12 = vec_sub(tmp12, tmp13); \
+ tmp12 = vec_sub(in##2, in##6); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+ tmp12 = vec_sub(tmp12, tmp13); \
\
- tmp0 = vec_add(tmp10, tmp13); \
- tmp3 = vec_sub(tmp10, tmp13); \
- tmp1 = vec_add(tmp11, tmp12); \
- tmp2 = vec_sub(tmp11, tmp12); \
+ tmp0 = vec_add(tmp10, tmp13); \
+ tmp3 = vec_sub(tmp10, tmp13); \
+ tmp1 = vec_add(tmp11, tmp12); \
+ tmp2 = vec_sub(tmp11, tmp12); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- z13 = vec_add(in##5, in##3); \
- z10 = vec_sub(in##5, in##3); \
- z10s = vec_sl(z10, pre_multiply_scale_bits); \
- z11 = vec_add(in##1, in##7); \
- z12s = vec_sub(in##1, in##7); \
- z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+ z13 = vec_add(in##5, in##3); \
+ z10 = vec_sub(in##5, in##3); \
+ z10s = vec_sl(z10, pre_multiply_scale_bits); \
+ z11 = vec_add(in##1, in##7); \
+ z12s = vec_sub(in##1, in##7); \
+ z12s = vec_sl(z12s, pre_multiply_scale_bits); \
\
- tmp11 = vec_sub(z11, z13); \
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+ tmp11 = vec_sub(z11, z13); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
\
- tmp7 = vec_add(z11, z13); \
+ tmp7 = vec_add(z11, z13); \
\
- /* To avoid overflow... \
- * \
- * (Original) \
- * tmp12 = -2.613125930 * z10 + z5; \
- * \
- * (This implementation) \
- * tmp12 = (-1.613125930 - 1) * z10 + z5; \
- * = -1.613125930 * z10 - z10 + z5; \
- */ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+ * = -1.613125930 * z10 - z10 + z5; \
+ */ \
\
- z5 = vec_add(z10s, z12s); \
- z5 = vec_madds(z5, pw_F1847, pw_zero); \
+ z5 = vec_add(z10s, z12s); \
+ z5 = vec_madds(z5, pw_F1847, pw_zero); \
\
- tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
- tmp10 = vec_sub(tmp10, z5); \
- tmp12 = vec_madds(z10s, pw_MF1613, z5); \
- tmp12 = vec_sub(tmp12, z10); \
+ tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+ tmp10 = vec_sub(tmp10, z5); \
+ tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+ tmp12 = vec_sub(tmp12, z10); \
\
- tmp6 = vec_sub(tmp12, tmp7); \
- tmp5 = vec_sub(tmp11, tmp6); \
- tmp4 = vec_add(tmp10, tmp5); \
+ tmp6 = vec_sub(tmp12, tmp7); \
+ tmp5 = vec_sub(tmp11, tmp6); \
+ tmp4 = vec_add(tmp10, tmp5); \
\
- out0 = vec_add(tmp0, tmp7); \
- out1 = vec_add(tmp1, tmp6); \
- out2 = vec_add(tmp2, tmp5); \
- out3 = vec_sub(tmp3, tmp4); \
- out4 = vec_add(tmp3, tmp4); \
- out5 = vec_sub(tmp2, tmp5); \
- out6 = vec_sub(tmp1, tmp6); \
- out7 = vec_sub(tmp0, tmp7); \
+ out0 = vec_add(tmp0, tmp7); \
+ out1 = vec_add(tmp1, tmp6); \
+ out2 = vec_add(tmp2, tmp5); \
+ out3 = vec_sub(tmp3, tmp4); \
+ out4 = vec_add(tmp3, tmp4); \
+ out5 = vec_sub(tmp2, tmp5); \
+ out6 = vec_sub(tmp1, tmp6); \
+ out7 = vec_sub(tmp0, tmp7); \
}
-void
-jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
short *dct_table = (short *)dct_table_;
int *outptr;
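
The "To avoid overflow" rewrite carried through DO_IDCT above rests on the identity below; F_1_613 = F_2_613 - 256 encodes FIX(2.613125930) - FIX(1), since FIX(1) = 256 at CONST_BITS = 8. The fixed-point multiply only ever sees the 1.613... multiplier, and the remaining -z10 is applied exactly by the trailing vec_sub:

    /* tmp12 = vec_madds(z10s, pw_MF1613, z5); tmp12 = vec_sub(tmp12, z10);
     * per lane (model, saturation omitted):
     *   tmp12 = (-1.613125930 * z10 + z5) - z10
     *         =  -2.613125930 * z10 + z5
     */
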
diff --git a/media/libjpeg/simd/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c
index 935f35d1eb..60e619f11d 100644
--- a/media/libjpeg/simd/jidctint-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c
@@ -1,7 +1,7 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -20,194 +20,192 @@
* 3. This notice may not be removed or altered from any source distribution.
*/
-/* SLOW INTEGER INVERSE DCT */
+/* ACCURATE INTEGER INVERSE DCT */
#include "jsimd_altivec.h"
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
-
-
-#define DO_IDCT(in, PASS) \
-{ \
- /* Even part \
- * \
- * (Original) \
- * z1 = (z2 + z3) * 0.541196100; \
- * tmp2 = z1 + z3 * -1.847759065; \
- * tmp3 = z1 + z2 * 0.765366865; \
- * \
- * (This implementation) \
- * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
- * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
- */ \
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
\
- in##26l = vec_mergeh(in##2, in##6); \
- in##26h = vec_mergel(in##2, in##6); \
+ in##26l = vec_mergeh(in##2, in##6); \
+ in##26h = vec_mergel(in##2, in##6); \
\
- tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
- tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
- tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
- tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+ tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+ tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+ tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+ tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
\
- tmp0 = vec_add(in##0, in##4); \
- tmp1 = vec_sub(in##0, in##4); \
+ tmp0 = vec_add(in##0, in##4); \
+ tmp1 = vec_sub(in##0, in##4); \
\
- tmp0l = vec_unpackh(tmp0); \
- tmp0h = vec_unpackl(tmp0); \
- tmp0l = vec_sl(tmp0l, const_bits); \
- tmp0h = vec_sl(tmp0h, const_bits); \
- tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
- tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+ tmp0l = vec_unpackh(tmp0); \
+ tmp0h = vec_unpackl(tmp0); \
+ tmp0l = vec_sl(tmp0l, const_bits); \
+ tmp0h = vec_sl(tmp0h, const_bits); \
+ tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+ tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
\
- tmp10l = vec_add(tmp0l, tmp3l); \
- tmp10h = vec_add(tmp0h, tmp3h); \
- tmp13l = vec_sub(tmp0l, tmp3l); \
- tmp13h = vec_sub(tmp0h, tmp3h); \
+ tmp10l = vec_add(tmp0l, tmp3l); \
+ tmp10h = vec_add(tmp0h, tmp3h); \
+ tmp13l = vec_sub(tmp0l, tmp3l); \
+ tmp13h = vec_sub(tmp0h, tmp3h); \
\
- tmp1l = vec_unpackh(tmp1); \
- tmp1h = vec_unpackl(tmp1); \
- tmp1l = vec_sl(tmp1l, const_bits); \
- tmp1h = vec_sl(tmp1h, const_bits); \
- tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
- tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+ tmp1l = vec_unpackh(tmp1); \
+ tmp1h = vec_unpackl(tmp1); \
+ tmp1l = vec_sl(tmp1l, const_bits); \
+ tmp1h = vec_sl(tmp1h, const_bits); \
+ tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+ tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
\
- tmp11l = vec_add(tmp1l, tmp2l); \
- tmp11h = vec_add(tmp1h, tmp2h); \
- tmp12l = vec_sub(tmp1l, tmp2l); \
- tmp12h = vec_sub(tmp1h, tmp2h); \
+ tmp11l = vec_add(tmp1l, tmp2l); \
+ tmp11h = vec_add(tmp1h, tmp2h); \
+ tmp12l = vec_sub(tmp1l, tmp2l); \
+ tmp12h = vec_sub(tmp1h, tmp2h); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- z3 = vec_add(in##3, in##7); \
- z4 = vec_add(in##1, in##5); \
+ z3 = vec_add(in##3, in##7); \
+ z4 = vec_add(in##1, in##5); \
\
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
\
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
\
- z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
\
- /* (Original) \
- * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
- * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
- * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * tmp0 += z1 + z3; tmp1 += z2 + z4; \
- * tmp2 += z2 + z3; tmp3 += z1 + z4; \
- * \
- * (This implementation) \
- * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
- * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
- * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
- * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
- * tmp0 += z3; tmp1 += z4; \
- * tmp2 += z3; tmp3 += z4; \
- */ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
\
- in##71l = vec_mergeh(in##7, in##1); \
- in##71h = vec_mergel(in##7, in##1); \
+ in##71l = vec_mergeh(in##7, in##1); \
+ in##71h = vec_mergel(in##7, in##1); \
\
- tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
- tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
- tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
- tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+ tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+ tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+ tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+ tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
\
- in##53l = vec_mergeh(in##5, in##3); \
- in##53h = vec_mergel(in##5, in##3); \
+ in##53l = vec_mergeh(in##5, in##3); \
+ in##53h = vec_mergel(in##5, in##3); \
\
- tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
- tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
- tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
- tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+ tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+ tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+ tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+ tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
\
- /* Final output stage */ \
+ /* Final output stage */ \
\
- out0l = vec_add(tmp10l, tmp3l); \
- out0h = vec_add(tmp10h, tmp3h); \
- out7l = vec_sub(tmp10l, tmp3l); \
- out7h = vec_sub(tmp10h, tmp3h); \
+ out0l = vec_add(tmp10l, tmp3l); \
+ out0h = vec_add(tmp10h, tmp3h); \
+ out7l = vec_sub(tmp10l, tmp3l); \
+ out7h = vec_sub(tmp10h, tmp3h); \
\
- out0l = vec_sra(out0l, descale_p##PASS); \
- out0h = vec_sra(out0h, descale_p##PASS); \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
+ out0l = vec_sra(out0l, descale_p##PASS); \
+ out0h = vec_sra(out0h, descale_p##PASS); \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
\
- out0 = vec_pack(out0l, out0h); \
- out7 = vec_pack(out7l, out7h); \
+ out0 = vec_pack(out0l, out0h); \
+ out7 = vec_pack(out7l, out7h); \
\
- out1l = vec_add(tmp11l, tmp2l); \
- out1h = vec_add(tmp11h, tmp2h); \
- out6l = vec_sub(tmp11l, tmp2l); \
- out6h = vec_sub(tmp11h, tmp2h); \
+ out1l = vec_add(tmp11l, tmp2l); \
+ out1h = vec_add(tmp11h, tmp2h); \
+ out6l = vec_sub(tmp11l, tmp2l); \
+ out6h = vec_sub(tmp11h, tmp2h); \
\
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
\
- out1 = vec_pack(out1l, out1h); \
- out6 = vec_pack(out6l, out6h); \
+ out1 = vec_pack(out1l, out1h); \
+ out6 = vec_pack(out6l, out6h); \
\
- out2l = vec_add(tmp12l, tmp1l); \
- out2h = vec_add(tmp12h, tmp1h); \
- out5l = vec_sub(tmp12l, tmp1l); \
- out5h = vec_sub(tmp12h, tmp1h); \
+ out2l = vec_add(tmp12l, tmp1l); \
+ out2h = vec_add(tmp12h, tmp1h); \
+ out5l = vec_sub(tmp12l, tmp1l); \
+ out5h = vec_sub(tmp12h, tmp1h); \
\
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
\
- out2 = vec_pack(out2l, out2h); \
- out5 = vec_pack(out5l, out5h); \
+ out2 = vec_pack(out2l, out2h); \
+ out5 = vec_pack(out5l, out5h); \
\
- out3l = vec_add(tmp13l, tmp0l); \
- out3h = vec_add(tmp13h, tmp0h); \
- out4l = vec_sub(tmp13l, tmp0l); \
- out4h = vec_sub(tmp13h, tmp0h); \
+ out3l = vec_add(tmp13l, tmp0l); \
+ out3h = vec_add(tmp13h, tmp0h); \
+ out4l = vec_sub(tmp13l, tmp0l); \
+ out4h = vec_sub(tmp13h, tmp0h); \
\
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- out4l = vec_sra(out4l, descale_p##PASS); \
- out4h = vec_sra(out4h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ out4l = vec_sra(out4l, descale_p##PASS); \
+ out4h = vec_sra(out4h, descale_p##PASS); \
\
- out3 = vec_pack(out3l, out3h); \
- out4 = vec_pack(out4l, out4h); \
+ out3 = vec_pack(out3l, out3h); \
+ out4 = vec_pack(out4l, out4h); \
}
-void
-jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
short *dct_table = (short *)dct_table_;
int *outptr;
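
Nearly every multiply in DO_IDCT above is a vec_msums, which reduces pairs of adjacent 16-bit lanes into a 32-bit accumulator (the real instruction saturates the final add; the model below omits that). This is why the inputs are first interleaved with vec_mergeh/vec_mergel: after merging, each 32-bit lane holds one (z3, z4) or (in7, in1) pair, so one instruction evaluates both terms of a rotated output.

    /* Scalar model of vec_msums(a, b, c) on eight shorts. */
    static void vec_msums_model(const short a[8], const short b[8],
                                const int c[4], int out[4])
    {
      int i;
      for (i = 0; i < 4; i++)
        out[i] = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1] + c[i];
    }
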
diff --git a/media/libjpeg/simd/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c
index 25cc296f7a..7d6e32542b 100644
--- a/media/libjpeg/simd/jquanti-altivec.c
+++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c
@@ -31,26 +31,25 @@
*/
#if __BIG_ENDIAN__
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_ld(0, elemptr); \
- if ((size_t)elemptr & 15) \
- in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_ld(0, elemptr); \
+ if ((size_t)elemptr & 15) \
+ in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}
#else
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_vsx_ld(0, elemptr); \
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_vsx_ld(0, elemptr); \
}
#endif
-void
-jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
{
JSAMPROW elemptr;
@@ -99,24 +98,23 @@ jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
}
-#define WORD_BIT 16
+#define WORD_BIT 16
/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
We basically need an unsigned equivalent of vec_madds(). */
-#define MULTIPLY(vs0, vs1, out) { \
- tmpe = vec_mule((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- tmpo = vec_mulo((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
- (__vector unsigned short)tmpo, \
- shift_pack_index); \
+#define MULTIPLY(vs0, vs1, out) { \
+ tmpe = vec_mule((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ tmpo = vec_mulo((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+ (__vector unsigned short)tmpo, \
+ shift_pack_index); \
}
-void
-jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
{
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
@@ -129,10 +127,10 @@ jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
__vector unsigned char shift_pack_index =
- {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+ { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
__vector unsigned char shift_pack_index =
- {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+ { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif
row0 = vec_ld(0, workspace);
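
The MULTIPLY macro above builds the missing unsigned 16-bit multiply out of three real instructions: vec_mule/vec_mulo form the full 32-bit products of the even and odd lanes, and shift_pack_index permutes just the high half of each product back into lane order (bytes 0-1 of each 32-bit product on big-endian, bytes 2-3 on little-endian). Per lane this is an unsigned multiply-high, the standard building block for replacing a quantization divide with a reciprocal multiply:

    /* Scalar model of one lane of MULTIPLY(vs0, vs1, out). */
    static unsigned short umulhi16(unsigned short a, unsigned short b)
    {
      return (unsigned short)(((unsigned int)a * b) >> 16);
    }
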
diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..b9e86dcfac
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd.c
@@ -0,0 +1,881 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+static unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "cpu", 3) != 0)
+ return 0;
+ buffer += 3;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
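
check_feature() only considers lines whose key starts with "cpu" and only accepts the feature name as a whitespace-delimited word, so a substring inside a model name cannot match. On an AltiVec-capable PowerPC Linux system the matching /proc/cpuinfo line typically looks like the following (the model string varies by machine):

    cpu             : 7447A, altivec supported

Here check_feature(line, "altivec") returns 1, because "altivec" is bounded by whitespace on both sides.
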
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "altivec"))
+ simd_support |= JSIMD_ALTIVEC;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+ uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+ int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+ int altivec;
+ size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+ unsigned long cpufeatures = 0;
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__amigaos4__)
+ IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+ if (altivec == VECTORTYPE_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+ if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+ elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+ if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEALTIVEC");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_ALTIVEC;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
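
Since init_simd() runs once per process, the two overrides make quick A/B comparisons easy. For example, assuming the stock cjpeg/djpeg utilities and a build without NO_GETENV:

    JSIMD_FORCENONE=1 djpeg -outfile out.ppm in.jpg
    JSIMD_FORCEALTIVEC=1 cjpeg -outfile out.jpg in.ppm

The first command exercises the portable C paths even on AltiVec hardware; the second claims AltiVec support regardless of what detection found, so it is only safe on hardware that actually has it.
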
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_extrgb_ycc_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_extbgr_ycc_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_rgb_ycc_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_extrgb_gray_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_extrgbx_gray_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_extbgr_gray_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_extbgrx_gray_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_extxbgr_gray_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_extxrgb_gray_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_rgb_gray_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_ycc_extrgb_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_ycc_extbgr_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_ycc_rgb_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+ break;
+ default:
+ altivecfct = jsimd_h2v2_merged_upsample_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+ break;
+ default:
+ altivecfct = jsimd_h2v1_merged_upsample_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
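
Every pair above follows the same contract: a jsimd_can_*() probe that runs init_simd() and re-checks the build-time assumptions (DCTSIZE, type widths, RGB_PIXELSIZE), and a wrapper that is only called once the probe has returned nonzero. A sketch of how the core library consumes this interface (the real selection logic lives in jcdctmgr.c and its siblings; this shows only the shape of it):

    typedef void (*fdct_method) (DCTELEM *data);

    static fdct_method select_fdct_islow(void)
    {
      if (jsimd_can_fdct_islow())   /* sizes OK and JSIMD_ALTIVEC set */
        return jsimd_fdct_islow;    /* AltiVec wrapper from this file */
      return NULL;                  /* caller installs the portable C DCT */
    }
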
diff --git a/media/libjpeg/simd/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h
index 62dbc5cdf0..e8bdb06a54 100644
--- a/media/libjpeg/simd/jsimd_altivec.h
+++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h
@@ -21,28 +21,27 @@
*/
#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
#include <altivec.h>
/* Common code */
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
-#define TRANSPOSE(row, col) \
-{ \
- __vector short row04l, row04h, row15l, row15h, \
- row26l, row26h, row37l, row37h; \
- __vector short col01e, col01o, col23e, col23o, \
- col45e, col45o, col67e, col67o; \
+#define TRANSPOSE(row, col) { \
+ __vector short row04l, row04h, row15l, row15h, \
+ row26l, row26h, row37l, row37h; \
+ __vector short col01e, col01o, col23e, col23o, \
+ col45e, col45o, col67e, col67o; \
\
/* transpose coefficients (phase 1) */ \
row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
@@ -65,18 +64,18 @@
col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
\
/* transpose coefficients (phase 3) */ \
- col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
- col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
- col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
- col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
- col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
- col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
- col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
- col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+ col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+ col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+ col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+ col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+ col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
}
#ifndef min
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
@@ -84,16 +83,16 @@
#if __BIG_ENDIAN__
-#define VEC_LD(a, b) vec_ld(a, b)
-#define VEC_ST(a, b, c) vec_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
-#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
#else
-#define VEC_LD(a, b) vec_vsx_ld(a, b)
-#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
-#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
#endif
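
TRANSPOSE above is a merge network: vec_mergeh/vec_mergel interleave two vectors of shorts (the phase-1 comments show the element order), and three rounds of such merges turn eight row vectors into eight column vectors with no scalar loads or stores. A scalar model of one merge step:

    /* vec_mergeh/vec_mergel on two vectors of eight shorts. */
    static void merge_model(const short a[8], const short b[8],
                            short mergeh[8], short mergel[8])
    {
      int i;
      for (i = 0; i < 4; i++) {
        mergeh[2 * i] = a[i];     mergeh[2 * i + 1] = b[i];
        mergel[2 * i] = a[i + 4]; mergel[2 * i + 1] = b[i + 4];
      }
    }
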
diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 8
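+; Editor's note (sketch, not upstream text): wk(i) addresses a scratch array
+; of WK_NUM ymmwords laid out immediately below the aligned rbp, so
+; wk(0) = rbp - 8*SIZEOF_YMMWORD and wk(7) = rbp - SIZEOF_YMMWORD.  The
+; prologue reserves this space with "lea rsp, [wk(0)]".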
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
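+    ; Editor's note (assumption, based on the register map above): the
+    ; sequence above builds a 32-byte-aligned frame so that aligned vmovdqa
+    ; accesses to wk(i) are legal.  The pre-alignment stack pointer (the
+    ; "original rbp" value held in rax) is saved at [rbp] and restored in
+    ; the epilogue via "mov rsp, rbp" / "pop rsp", and collect_args leaves
+    ; the five arguments in r10-r14 independent of the host calling
+    ; convention.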
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdx
+ push rbx
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ vmovd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
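+    ; Editor's note (assumption): on entry to .column_ld1, rcx held the
+    ; remaining pixel count; the lea above converts it to a byte count
+    ; (rcx*3).  Each .column_ldN label tests one bit of that count and
+    ; loads a 1/2/4/8/16/32-byte chunk from the tail of the row, shifting
+    ; previously gathered bytes upward, so ymmA/ymmF end up holding the
+    ; trailing pixels as if a full block had been read.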
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
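+    ; Editor's note (assumption): the unpack/shift ladder above is a byte
+    ; "perfect shuffle" that deinterleaves 3-byte RGB pixels into six planar
+    ; word vectors, holding the even/odd samples of each component and
+    ; matching the RE/RO/GE/GO/BE/BO summary comment below.  The ymmA..ymmH
+    ; names are register aliases (assumed to be set up by jcolsamp.inc per
+    ; RGB component order) for ymm0..ymm7.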
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
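+    ; Editor's note (assumption): the green term is split as
+    ; 0.587 = 0.337 + 0.250 because FIX(0.587) = 38470 does not fit in the
+    ; signed 16-bit words that vpmaddwd multiplies; pairing 0.299/0.337 and
+    ; 0.114/0.250 keeps every constant in range while summing to the same
+    ; coefficients.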
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
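+    ; Editor's note (sketch): each CbO dword is computed in 16.16 fixed
+    ; point as
+    ;   (-FIX(0.168)*RO - FIX(0.331)*GO + (BO << 15)
+    ;    + (1 << 15) - 1 + (CENTERJSAMPLE << 16)) >> SCALEBITS
+    ; where BO << 15 equals FIX(0.5)*BO, obtained by interleaving zeros
+    ; below B (giving B << 16) and shifting right by one.  PD_ONEHALFM1_CJ
+    ; folds the rounding bias and the +CENTERJSAMPLE offset into a single
+    ; constant.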
+
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [rbx], ymm5 ; Save Cb
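+    ; Editor's note (assumption): CbE/CbO hold the even/odd samples as
+    ; words with the sample value in the low byte, so shifting CbO left by
+    ; BYTE_BIT and ORing interleaves them into natural pixel order; a
+    ; single 32-byte store then writes 32 consecutive Cb samples.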
+
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [rel PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [rdx], ymm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_YMMWORD
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ add rbx, byte SIZEOF_YMMWORD ; outptr1
+ add rdx, byte SIZEOF_YMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
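+; Editor's note: vzeroupper in the epilogue clears the upper YMM lanes to
+; avoid the AVX-to-legacy-SSE transition penalty in callers; the SSE2
+; variant of this routine needs no such instruction.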
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdx
+ push rbx
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [rbx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [rdx], xmm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ add rbx, byte SIZEOF_XMMWORD ; outptr1
+ add rdx, byte SIZEOF_XMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
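+    ; Editor's note (assumption): after each full-width iteration, control
+    ; either stays in .columnloop (at least SIZEOF_XMMWORD pixels remain)
+    ; or re-enters .column_ld1 to gather the sub-16-pixel tail; the flow
+    ; mirrors the AVX2 variant at half the vector width.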
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
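+; Editor's note (sketch): FIX(x) denotes round(x * 2^SCALEBITS), e.g.
+; FIX(0.29900) = round(0.299 * 65536) = 19595.  PD_ONEHALFM1_CJ folds the
+; rounding bias (1 << (SCALEBITS-1)) - 1 and the chroma offset
+; (CENTERJSAMPLE << SCALEBITS) into a single per-lane constant.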
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
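+; Editor's note: the %undef/%define/%include pattern above is textual
+; templating: jccolext-avx2.asm is expanded seven times, once for the
+; default RGB layout and once per extended layout, with the entry point
+; renamed each time so a single source file yields all of the
+; jsimd_ext*_ycc_convert_avx2 variants.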
diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
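+; Editor's note: these are the same constants as in jccolor-avx2.asm,
+; replicated with "times 4" instead of "times 8" because an xmmword holds
+; half as many words and dwords as a ymmword.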
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
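+; Editor's note: the grayscale path computes only the Y channel, so the
+; Cb/Cr constants of jccolor-avx2.asm (PW_MF016_MF033, PW_MF008_MF041,
+; PD_ONEHALFM1_CJ) are intentionally absent here.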
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 2
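+; Editor's note (assumption): WK_NUM is 2 rather than 8 because the
+; grayscale kernel spills far less intermediate state than the full YCC
+; conversion, and the prologue below fetches only one output plane
+; (output_buf[0]).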
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ vmovd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
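+ ;
+ ; (G is split as 0.587 = 0.337 + 0.250 so that each vpmaddwd can pair
+ ; G with another channel: {R,G} uses PW_F0299_F0337 and {B,G} uses
+ ; PW_F0114_F0250.  A rough scalar sketch of the fixed-point math,
+ ; assuming SCALEBITS = 16 and FIX(x) = (int)((x) * 65536 + 0.5) as in
+ ; jccolor.c:
+ ;
+ ; int y = FIX(0.29900) * r + FIX(0.33700) * g  /* first vpmaddwd  */
+ ;       + FIX(0.11400) * b + FIX(0.25000) * g  /* second vpmaddwd */
+ ;       + ONE_HALF;                            /* PD_ONEHALF      */
+ ; Y = (JSAMPLE)(y >> SCALEBITS);               /* vpsrld + pack   */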
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm4, ymm4, ymm7
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ sub rcx, byte SIZEOF_YMMWORD
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
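+ ;
+ ; (Same decomposition as the AVX2 flavor: splitting G as
+ ; 0.587 = 0.337 + 0.250 lets each pmaddwd pair G with another channel
+ ; via the PW_F0299_F0337 and PW_F0114_F0250 constant pairs, with
+ ; PD_ONEHALF supplying the rounding term before the shift by SCALEBITS.)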
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
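+
+; (For reference, this must stay in sync with the C declaration of
+; c_derived_tbl in jchuff.h, roughly:
+;
+;   typedef struct {
+;     unsigned int ehufco[256];   /* code for each symbol */
+;     char ehufsi[256];           /* length of code for each symbol */
+;   } c_derived_tbl;
+; )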
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+ dd 0x000f, 0x001f, 0x003f, 0x007f
+ dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+ alignz 32
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+ alignz 32
+
+%define NBITS(x) nbits_base + x
+%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
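+
+; (The byte table above is mirrored around the jpeg_nbits_table label so
+; that NBITS() also works for the negative inputs produced by the
+; coefficient pre-pass, which decrements negative coefficients by one:
+; e.g. coefficient -3 is stored as -4, and byte [nbits_base - 4] falls in
+; the "times 1 << 1 db 2" run above the label, giving the correct 2-bit
+; category.  jpeg_mask_bits[n] = (1 << n) - 1; MASK_BITS() addresses it
+; relative to nbits_base, the only table base kept in a register.)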
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values
+; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
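+;
+; In rough C (illustrative only; emit_with_ff_stuffing() is a
+; hypothetical helper standing in for the inline byte loop below):
+;
+;   nbits += free_bits;                  /* free_bits <= 0 on entry */
+;   uint64_t out = (put_buffer << nbits) | ((unsigned)code >> -free_bits);
+;   emit_with_ff_stuffing(buffer, __builtin_bswap64(out));
+;   put_buffer = code;                   /* low -free_bits bits remain */
+;   free_bits += 64;  buffer += 8;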
+
+%macro EMIT_QWORD 1-2
+ add nbitsb, free_bitsb ; nbits += free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ mov tempd, code ; temp = code;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbitsb, free_bitsb ; nbits = free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ shr tempd, nbitsb ; temp >>= nbits;
+ or tempq, put_buffer ; temp |= put_buffer;
+ movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
+ bswap tempq ; temp = htobe64(temp);
+ mov put_buffer, codeq ; put_buffer = code;
+ pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+ %2
+ pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
+ mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
+ ; (speculative; will be overwritten if
+ ; code contains any 0xFF bytes)
+ add free_bitsb, 64 ; free_bits += 64;
+ add bufferp, 8 ; buffer += 8;
+ test code, code ; if (code == 0) /* No 0xFF bytes */
+ jz %1 ; return;
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer-7], 0 ; buffer[-7] = 0;
+ sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempd, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ jmp %1 ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
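+;
+; The resulting scan loop (.BLOOP below) is, in rough C (names follow
+; the register %defines):
+;
+;   while (index) {
+;     nbits = __builtin_ctzll(index) + 1;  /* tzcnt; +1 skips the hit */
+;     t += nbits;  index >>= nbits;
+;     /* ... emit the Huffman code for the run/size of *t ... */
+;   }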
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8 - dctbl --> code_temp
+; r9 - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer rax
+%ifdef WIN64
+%define bufferp rax
+%else
+%define bufferp raxp
+%endif
+%define tempq rbx
+%define tempd ebx
+%define tempb bl
+%define temph bh
+%define nbitsq rcx
+%define nbits ecx
+%define nbitsb cl
+%define block rdx
+%define nbits_base rsi
+%define t rdi
+%define td edi
+%define codeq rbp
+%define code ebp
+%define dctbl r8
+%define actbl r9
+%define state r10
+%define index r11
+%define indexd r11d
+%define put_buffer r12
+%define put_bufferd r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
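+;
+; (In the body below, the ;A: .. ;H: comment prefixes tag eight
+; independent shuffle chains, one per group of eight reordered
+; coefficients, and ;Z: tags the interleaved scalar code stream, so each
+; software-pipelined sequence can be read per stream.)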
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ mov buffer, rdx
+ mov block, r8
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push rsi
+ push rdi
+ push r12
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ mov state, rcx
+ movsx code, word [block] ;Z: code = block[0];
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub code, r9d ;Z: code -= last_dc_val;
+ mov dctbl, POINTER [rsp+6*8+4*8]
+ mov actbl, POINTER [rsp+6*8+5*8]
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea nbits_base, [rel jpeg_nbits_table]
+ add rsp, -DCTSIZE2 * SIZEOF_WORD
+ mov t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push r12
+ mov state, rdi
+ mov buffer, rsi
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ movsx codeq, word [block] ;Z: code = block[0];
+ lea nbits_base, [rel jpeg_nbits_table]
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub codeq, rcx ;Z: code -= last_dc_val;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
+
+%endif
+
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
+ ;Z: i.e. if code is positive
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ movsxd codeq, code ;Z: sign extend code
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
+ pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl tempd, 16 ;Z: temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ or put_bufferd, tempd ;Z: put_buffer |= temp;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+ ;Z: temp = dctbl->ehufco[nbits];
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ shl tempq, nbitsb ;Z: temp <<= nbits;
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ or code, tempd ;Z: code |= temp;
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define free_bitsq rdx
+%define free_bitsd edx
+%define free_bitsb dl
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ shl tempq, 48 ;Z: temp <<= 48;
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ or tempq, put_buffer ;Z: temp |= put_buffer;
+ movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ lea t, [dword t - 2] ;Z: t = &t[-1];
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+ ;Z: nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp r8d
+ pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
+ mov free_bitsd, [state+working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+ pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
+ shl index, 32 ;Z: index <<= 32;
+ mov put_buffer, [state+working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ or index, tempq ;Z: index |= temp;
+ not index ;Z: index = ~index;
+ sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
+ jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
+ align 16
+.EMIT_CODE: ;Z: .EMIT_CODE:
+ EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BRLOOP: ; do {
+ lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbits, code_temp ; nbits = code_temp;
+ or put_buffer, codeq ; put_buffer |= code;
+ cmp nbits, 16 ; if (nbits <= 16)
+ jle .ERLOOP ; break;
+ jmp .BRLOOP ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+ times 5 nop
+.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.BLOOP_COND: ; .BLOOP_COND:
+ test index, index ; if (index != 0)
+ jz .ELOOP ; {
+.BLOOP: ; do {
+ xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
+ tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
+ inc nbits ; ++nbits;
+ lea t, [t + nbitsq * 2] ; t = &t[nbits];
+ shr index, nbitsb ; index >>= nbits;
+.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
+ cmp nbits, 16 ; if (nbits > 16)
+ jg .BRLOOP ; goto .BRLOOP;
+.ERLOOP: ; .ERLOOP:
+ movsx codeq, word [t] ; code = *t;
+ lea tempd, [nbitsq * 2] ; temp = nbits * 2;
+ movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
+ lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
+ mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+ ; code_temp = actbl->ehufco[temp-16];
+ shl code_temp, nbitsb ; code_temp <<= nbits;
+ and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
+ add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+ ; nbits += actbl->ehufsi[temp-16];
+ or code, code_temp ; code |= code_temp;
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_CODE ; goto .EMIT_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP ; } while (index != 0);
+.ELOOP: ; } /* index != 0 */
+ sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+ cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
+%else
+ cmp td, -2 * SIZEOF_WORD ; if (t != -2)
+%endif
+ je .EFN ; {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD .EFN ; insert code, flush buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.EFN: ; } }
+ mov [state + working_state.cur.put_buffer.simd], put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ mov byte [state + working_state.cur.free_bits], free_bitsb
+ ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+ sub rsp, -DCTSIZE2 * SIZEOF_WORD
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+%else
+ pop r12
+ pop rbp
+ pop rbx
+%endif
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP_CODE:
+ EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+ ; insert code, flush buffer,
+ ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ mov T1d, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
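+; REDUCE0 collapses the 64 prepared values into the 64-bit "zerobits"
+; bitmap stored through r15, using pcmpeqw/packsswb/pmovmskb and a final
+; NOT; in rough C:
+;
+;   uint64_t zerobits = 0;
+;   for (k = 0; k < DCTSIZE2; k++)
+;     if (values[k] != 0) zerobits |= 1ULL << k;
+;   *zerobits_ptr = zerobits;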
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+ movdqa xmm7, XMMWORD [VALUES + (56*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, ZERO
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl rcx, 16
+ shl rdx, 32
+ shl rsi, 48
+
+ or rax, rcx
+ or rdx, rsi
+ or rax, rdx
+
+ not rax
+
+ mov MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
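+;
+; A rough scalar equivalent of the preparation (sketch; mirrors the C
+; code in jcphuff.c, assuming 32-bit int):
+;
+;   for (k = 0; k < Sl; k++) {
+;     int temp = block[jpeg_natural_order_start[k]];
+;     int sign = temp >> 31;              /* -1 if negative, else 0 */
+;     temp = (temp + sign) ^ sign;        /* abs(temp)              */
+;     values[k] = temp >>= Al;            /* point transform        */
+;     values[k + DCTSIZE2] = temp ^ sign; /* sign info for encoder  */
+;   }
+;   /* *zerobits is then built by REDUCE0 above */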
+
+%define ZERO xmm9
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16]
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO
+
+ movd AL, r13d
+ pxor ZERO, ZERO
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7
+ shr K, 4
+ jz .ELOOP16
+.BLOOP16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ test LEN, 15
+ je .PADDING
+.ELOOP16:
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ movdqa ZERO, XMMWORD [rbp - 16]
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
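+;
+; Rough scalar sketch (illustrative only; see jcphuff.c for the exact
+; reference implementation):
+;
+;   for (k = 0; k < Sl; k++) {
+;     absvalues[k] = temp = abs(block[jpeg_natural_order_start[k]]) >> Al;
+;     if (temp == 1) EOB = k;   /* index of last shifted value == 1 */
+;   }
+;   bits[0] = zero/nonzero bitmap (REDUCE0);
+;   bits[1] = packed, complemented sign bits;  return EOB;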
+
+%define ZERO xmm9
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define KK r9d
+%define EOB r8d
+%define SIGN rdi
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16]
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO
+
+ xor SIGN, SIGN
+ xor EOB, EOB
+ xor KK, KK
+ movd AL, r13d
+ pxor ZERO, ZERO
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for sizebits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 16
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15
+ je .PADDINGR
+.ELOOPR16:
+ test LEN, 8
+ jz .TRYR7
+ test LEN, 7
+ jz .TRYR8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for sizebits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for sizebits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for sizebits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ shr SIGN, 8
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ not SIGN
+ sub VALUES, DCTSIZE2*2
+ mov MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+ REDUCE0
+
+ mov eax, EOB
+ movdqa ZERO, XMMWORD [rbp - 16]
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
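+;
+; Rough scalar equivalent of the vector loop below (cf. h2v1_downsample()
+; in jcsample.c; "bias" alternates 0,1 so the halving rounds without a
+; systematic drift, and expand_right_edge has already padded each row):
+;
+;   for (outcol = 0; outcol < output_cols; outcol++) {
+;     *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
+;     bias ^= 1;
+;     inptr += 2;
+;   }
+;
+; The vector code below produces 32 output samples per iteration.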
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
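+    ; (replicate each row's last valid sample into the padding columns so
+    ; that the downsampling loop can always read 2 * output_cols bytes;
+    ; this mirrors expand_right_edge() in the C code)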
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+
+.columnloop_r24:
+    ; rcx can only be 8, 16, or 24 at this point
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+    vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT
+ vpand ymm0, ymm0, ymm6
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
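+;
+; Rough scalar equivalent (cf. h2v2_downsample() in jcsample.c; the bias
+; alternates 1,2 so the divide-by-4 rounds without a systematic drift):
+;
+;   for (outcol = 0; outcol < output_cols; outcol++) {
+;     *outptr++ = (JSAMPLE)((inptr0[0] + inptr0[1] +
+;                            inptr1[0] + inptr1[1] + bias) >> 2);
+;     bias ^= 3;
+;     inptr0 += 2;  inptr1 += 2;
+;   }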
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+
+.columnloop_r24:
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6
+ vpsrlw ymm0, ymm0, BYTE_BIT
+ vpand ymm5, ymm1, ymm6
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+
+ vpand ymm4, ymm2, ymm6
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
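+;
+; The XMM counterpart of the AVX2 kernel above: the same averaging with
+; an alternating bias, but each iteration consumes 32 input samples and
+; produces 16 output samples.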
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz short .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rdi, r13
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rax
+ push rdi
+ push rdx
+ push rbx
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop:
+
+ vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7
+ vpaddw ymm3, ymm5, ymm7
+ vpaddw ymm6, ymm0, ymm7
+ vpaddw ymm7, ymm1, ymm7
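+        ; (0xFF80 is -128 as a signed word, so the four adds above
+        ; recenter the unsigned Cb/Cr samples from [0, 255] to the
+        ; signed range [-128, 127] before the fixed-point arithmetic)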
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
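+        ;
+        ; The FIX() constants are fractions scaled by 2^16, and vpmulhw
+        ; keeps only the high 16 bits of each 32-bit product, so a term
+        ; such as CbE * -FIX(0.22800) is evaluated per word lane roughly as
+        ;
+        ;   hi = ((2 * cb) * -F_0_228) >> 16;  /* vpmulhw on doubled input */
+        ;   t  = (hi + 1) >> 1;                /* add PW_ONE, vpsraw by 1 */
+        ;
+        ; Doubling the input first preserves one extra bit through the
+        ; truncating multiply; the final (hi + 1) >> 1 rounds the halving.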
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [rel PW_ONE]
+ vpaddw ymm5, ymm5, [rel PW_ONE]
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [rel PW_ONE]
+ vpaddw ymm1, ymm1, [rel PW_ONE]
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm5, ymm5, ymm3
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6
+ vpunpcklwd ymm2, ymm2, ymm6
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
+ vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285]
+ vpunpckhwd ymm5, ymm3, ymm7
+ vpunpcklwd ymm3, ymm3, ymm7
+ vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285]
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
+
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF]
+ vpaddd ymm4, ymm4, [rel PD_ONEHALF]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm4, ymm4, SCALEBITS
+ vpaddd ymm3, ymm3, [rel PD_ONEHALF]
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF]
+ vpsrad ymm3, ymm3, SCALEBITS
+ vpsrad ymm5, ymm5, SCALEBITS
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
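+    ; Use non-temporal stores when the output pointer is 32-byte aligned;
+    ; the sfence executed before returning orders them with later stores.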
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+    lea rcx, [rcx+rcx*2]            ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rdi, r13
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rax
+ push rdi
+ push rdx
+ push rbx
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop:
+
+ movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
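+        ;
+        ; (Even- and odd-numbered samples travel in separate registers
+        ; throughout -- the E/O suffixes -- and are re-interleaved only
+        ; when the R, G and B bytes are packed into pixels further down.)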
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [rel PW_ONE]
+ paddw xmm5, [rel PW_ONE]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [rel PW_ONE]
+ paddw xmm1, [rel PW_ONE]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm4, [rel PW_MF0344_F0285]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [rel PW_MF0344_F0285]
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm4, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [rel PD_ONEHALF]
+ paddd xmm5, [rel PD_ONEHALF]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+    lea rcx, [rcx+rcx*2]            ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
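+
+; (FIX(x) denotes x scaled by 2^16 and rounded to the nearest integer,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.)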
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
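+
+; PD_ONEHALF is the round-to-nearest term added before the final
+; arithmetic right shift by SCALEBITS in the convert routines.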
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
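+; jdcolext-avx2.asm below is a template: it is included once with the
+; default RGB layout and then re-included with RGB_RED/RGB_GREEN/RGB_BLUE
+; and RGB_PIXELSIZE redefined, generating one jsimd_ycc_ext*_convert_avx2
+; routine per supported extended pixel format.
+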
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
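+; jdcolext-sse2.asm below is used as a template in the same fashion as
+; the AVX2 flavor, re-included once per supported extended pixel format.
+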
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
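+;
+; (The r10-r13 mapping above is set up by the collect_args macro from
+; jsimdext.inc, which is assumed to copy the first four integer arguments
+; out of the native ABI registers (rdi/rsi/rdx/rcx on SysV, rcx/rdx/r8/r9
+; on Win64) so that the function body is calling-convention-neutral.)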
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
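+ ;
+ ; The rewritten coefficients all have magnitude < 1, so each fits in a
+ ; signed 16-bit word as required by vpmulhw, which keeps only the high
+ ; word of every 32-bit product, i.e. (x * c) >> 16; the dropped integer
+ ; parts (the bare Cr and Cb terms) are restored with plain word adds.
+ ; Multiplying the doubled values 2*Cb/2*Cr gains one bit of precision,
+ ; and the "add PW_ONE, shift right 1" pairs below halve the result with
+ ; rounding.  Sketch for the R term:
+ ;   (2*Cr * F_0_402) >> 16 = (Cr * 26345) >> 15 ~= Cr * 0.80400
+ ;   (the above + 1) >> 1 ~= Cr * 0.40200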
+
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [rel PW_ONE]
+ vpaddw ymm4, ymm4, [rel PW_ONE]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [rel PW_ONE]
+ vpaddw ymm0, ymm0, [rel PW_ONE]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
+ vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
+ vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285]
+
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF]
+ vpaddd ymm6, ymm6, [rel PD_ONEHALF]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF]
+ vpaddd ymm7, ymm7, [rel PD_ONEHALF]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
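+ ; (Chroma is subsampled 2:1 horizontally, so one block of chroma
+ ; difference terms covers two Y vectors: the first pass consumes the
+ ; low halves computed above, the second reloads the high halves saved
+ ; in wk(0)..wk(2).)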
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
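+ ; Tail handling for the last partial block of the row: rcx is converted
+ ; from pixels to bytes, then progressively narrower stores (64, 32, 16,
+ ; 8, 4, 2, and finally 1 byte) are used so that no write ever lands past
+ ; the end of the output row.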
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA if at least 8 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA if at least 4 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax if at least 2 bytes of output space
+ ; remain.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the low byte of rax if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA if at least two pixels of output
+ ; space remain.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
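+;
+; This routine is a thin wrapper: it builds a three-entry row-pointer table
+; on the stack in which the luma array is pre-biased by the row group
+; counter, so the h2v1 kernel reads luma row 2*ctr (and, on the second
+; call, row 2*ctr+1) while both chroma arrays are still indexed at row
+; ctr; the output pointer advances by one JSAMPROW between the two calls.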
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
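+ ;
+ ; The G term uses pmaddwd: punpcklwd/punpckhwd interleave Cb and Cr
+ ; words, and PW_MF0344_F0285 holds the matching constant pair
+ ; (-FIX(0.34414), FIX(0.28586)), so each dword result is
+ ; Cb * -0.34414 + Cr * 0.28586 scaled by 2^16.  PD_ONEHALF adds the
+ ; rounding constant 2^(SCALEBITS-1) before the psrad by SCALEBITS, and
+ ; the final psubw of Cr turns the +0.28586 into the required -0.71414.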
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [rel PW_ONE]
+ paddw xmm4, [rel PW_ONE]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [rel PW_ONE]
+ paddw xmm0, [rel PW_ONE]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+ pmaddwd xmm6, [rel PW_MF0344_F0285]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm7, [rel PW_MF0344_F0285]
+
+ paddd xmm5, [rel PD_ONEHALF]
+ paddd xmm6, [rel PD_ONEHALF]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm7, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA if at least 8 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA if at least 4 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax if at least 2 bytes of output space
+ ; remain.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the low byte of rax if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA if at least two pixels of output
+ ; space remain.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1
+PW_TWO times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ push_xmm 3
+ collect_args 4
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
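+ ; ymm10 isolates the first byte of a block and ymm9 the last; they are
+ ; used below to splice the replicated edge sample into the in[i-1] and
+ ; in[i+1] neighbor vectors at the row ends (for the first output pixel
+ ; this gives (3*x + x + 1) >> 2 = x, a clean pass-through).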
+
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_YMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
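+ ; (The dummy sample replicates the last real column so that the final
+ ; full-width vector block reads a valid right-hand neighbor; the last
+ ; output pixel then evaluates to (3*x + x + 2) >> 2 = x, the expected
+ ; edge value.)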
+.skip:
+ vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ jmp short .upsample
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vperm2i128 ymm6, ymm0, ymm6, 0x20
+ vpslldq ymm6, ymm6, 15
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpmullw ymm1, ymm1, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+ vpaddw ymm2, ymm2, [rel PW_ONE]
+ vpaddw ymm5, ymm5, [rel PW_ONE]
+ vpaddw ymm3, ymm3, [rel PW_TWO]
+ vpaddw ymm6, ymm6, [rel PW_TWO]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+ sub rax, byte SIZEOF_YMMWORD
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ push_xmm 3
+ collect_args 4
+ push rbx
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ test rax, SIZEOF_YMMWORD-1
+ jz short .skip
+ push rdx
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1
+ vmovdqa YMMWORD [wk(1)], ymm2
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ ; -- process the last column block
+
+ vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm8, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm8, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1
+ vmovdqa YMMWORD [wk(3)], ymm2
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm0, ymm8, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm8, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm8, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm8, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4
+
+ vpmullw ymm7, ymm7, [rel PW_THREE]
+ vpmullw ymm3, ymm3, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm5, ymm5, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_SEVEN]
+ vpaddw ymm2, ymm2, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm7, ymm8, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm8, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm8, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm8, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3
+
+ vpmullw ymm6, ymm6, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_EIGHT]
+ vpaddw ymm7, ymm7, [rel PW_SEVEN]
+ vpaddw ymm5, ymm5, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+ sub rax, byte SIZEOF_YMMWORD
+ add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
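+; As a rough scalar sketch of what the SIMD below computes, each input
+; sample is simply replicated:
+;
+;   out[2 * i]     = in[i];
+;   out[2 * i + 1] = in[i];
+;
+; The vpunpcklbw/vpunpckhbw pairs below perform this byte duplication 16
+; or 32 input bytes at a time.
+;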
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov edx, r11d
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz short .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+ mov rax, rdx ; colctr
+.columnloop:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .above_16
+
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
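+; In scalar terms this is the same replication as the h2v1 case, with the
+; row also duplicated vertically -- both output rows receive identical
+; data:
+;
+;   out0[2 * i] = out0[2 * i + 1] = in[i];
+;   out1[2 * i] = out1[2 * i + 1] = in[i];
+;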
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov edx, r11d
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax, rdx ; colctr
+.columnloop:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .above_16
+
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp near .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr
+ add rbx, 2*SIZEOF_YMMWORD ; outptr0
+ add rdi, 2*SIZEOF_YMMWORD ; outptr1
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
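+; As a scalar sketch (mirroring the C upsampler that this routine
+; replaces), each input sample yields two outputs, with rounding constants
+; 1 and 2 chosen so that the two phases round in opposite directions:
+;
+;   out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
+;   out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
+;
+; The PW_ONE/PW_TWO/PW_THREE constants above encode exactly these weights.
+;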
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-1)
+ pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+ pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ jmp short .upsample
+
+.columnloop:
+ movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+ paddw xmm2, [rel PW_ONE]
+ paddw xmm5, [rel PW_ONE]
+ paddw xmm3, [rel PW_TWO]
+ paddw xmm6, [rel PW_TWO]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+ sub rax, byte SIZEOF_XMMWORD
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
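+; As a scalar sketch, the kernel first blends each row with its neighbor
+; row into 16-bit intermediates (int[] below; the row above for outptr0,
+; the row below for outptr1), then interpolates horizontally with a final
+; descale:
+;
+;   int[i]         = 3 * row[0][i] + row[adj][i];
+;   out[2 * i]     = (3 * int[i] + int[i - 1] + 8) >> 4;
+;   out[2 * i + 1] = (3 * int[i] + int[i + 1] + 7) >> 4;
+;
+; which is why PW_THREE, PW_SEVEN, and PW_EIGHT appear in the code below.
+;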
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push rdx
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [rel PW_THREE]
+ pmullw xmm3, [rel PW_THREE]
+ paddw xmm1, [rel PW_EIGHT]
+ paddw xmm5, [rel PW_EIGHT]
+ paddw xmm0, [rel PW_SEVEN]
+ paddw xmm2, [rel PW_SEVEN]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+ paddw xmm1, [rel PW_EIGHT]
+ paddw xmm0, [rel PW_EIGHT]
+ paddw xmm7, [rel PW_SEVEN]
+ paddw xmm5, [rel PW_SEVEN]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+ sub rax, byte SIZEOF_XMMWORD
+ add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
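+; Scalar sketch, as in the AVX2 version: out[2 * i] = out[2 * i + 1] =
+; in[i], processed here with SSE2 unpacks 16 input bytes at a time.
+;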
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov edx, r11d
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz short .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+ mov rax, rdx ; colctr
+.columnloop:
+
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop
+
+.return:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
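+; Scalar sketch: identical replication, written to two output rows at once
+; (outptr0 and outptr1 receive the same data).
+;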
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov edx, r11d
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax, rdx ; colctr
+.columnloop:
+
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
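+;
+; For reference (not from the original source): the four constants defined
+; below come from the Arai/Agui/Nakajima (AAN) factorization of the
+; 8-point DCT -- 0.707107 ~= cos(pi/4), 0.382683 ~= sin(pi/8),
+; 0.541196 ~= sqrt(2) * sin(pi/8), and 1.306563 ~= sqrt(2) * cos(pi/8).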
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, byte 4*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .columnloop
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see that file for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
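+
+; As a worked example of the FIX() encoding above (with CONST_BITS == 8):
+; FIX(0.541196100) = round(0.541196100 * 2^8) = round(138.55) = 139,
+; which is the F_0_541 value above.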
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
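+
+; A worked example of how these factors combine (a sketch, with the
+; default CONST_BITS == 8): F_0_707 = 181 and CONST_SHIFT == 6, so the
+; PW_F0707 constant below is 181 << 6 = 11584.  Inputs are pre-shifted
+; left by 2 bits, and pmulhw keeps the high 16 bits of the 32-bit
+; product, so
+;
+;   ((x << 2) * (181 << 6)) >> 16  =  (x * 181) >> 8  ~=  x * 0.707
+;
+; i.e. 2 + 8 + 6 == 16, matching the constraint stated above.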
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
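+
+; (DESCALE(x, n) shifts right by n bits with round-to-nearest: e.g.
+; DESCALE(320652955, 30 - 13) = (320652955 + 65536) >> 17 = 2446, which
+; matches the precomputed F_0_298 value in the CONST_BITS == 13 table above.)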
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D
+ vpermq %2, %2, 0x8D
+ vpermq %3, %3, 0xD8
+ vpermq %4, %4, 0xD8
+ ; transpose coefficients(phase 3)
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
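+ ; (The vpermq immediates select 64-bit qwords from the source, two bits
+ ; per destination qword: 0x8D picks (1, 3, 0, 2) and 0xD8 picks
+ ; (0, 2, 1, 3), producing the half-row orderings shown above.)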
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9: Pass (1 or 2)
+
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
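+ ; (PW_1_NEG1 makes vpsignw negate only the upper lane, so the vpaddw
+ ; above yields the sum in the low lane and the difference in the high
+ ; lane in a single instruction.)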
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [rel PW_DESCALE_P2X]
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
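+ ; (Pass 1 keeps PASS1_BITS extra fraction bits in the intermediate
+ ; results; pass 2 removes them again, rounding via PW_DESCALE_P2X.)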
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
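+ ;
+ ; (The two forms are identical: substituting z1 into data2 gives
+ ;  (tmp12 + tmp13) * 0.541196100 + tmp13 * 0.765366865
+ ;  = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100, so one
+ ;  vpmaddwd per half computes data2 and data6 from the paired constants
+ ;  in PW_F130_F054_MF130_F054.)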
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
+ vpaddd %6, %6, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
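+ ;
+ ; (z5 is folded into both outputs; the z3 on the right-hand side of the
+ ;  second line is the original, pre-update z3, so each output again
+ ;  reduces to one vpmaddwd against PW_MF078_F117_F078_F117.)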
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
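+ ;
+ ; (z1 and z2 are eliminated by substituting z1 = tmp4 + tmp7 and
+ ;  z2 = tmp5 + tmp6 into the products, collapsing each output to one
+ ;  vpmaddwd plus the z3/z4 addition below.)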
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [rel PW_F050_MF256_F060_MF089] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %9]
+ vpaddd %5, %5, [rel PD_DESCALE_P %+ %9]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
+ times 4 dw (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm4, ymm6, 0x31
+ vperm2i128 ymm2, ymm5, ymm7, 0x20
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
+ vzeroupper
+ uncollect_args 1
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
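+
+ ; (The stack pointer is aligned down to a 16-byte boundary so the wk()
+ ; spill slots can be accessed with movdqa; the pre-alignment value saved
+ ; at [rbp] is restored by the "pop rsp" in the epilogue.)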
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
+ pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
+ pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm6, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm0, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
+ pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
+ pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm4, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
+ pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [rel PD_DESCALE_P1]
+ paddd xmm3, [rel PD_DESCALE_P1]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [rel PW_DESCALE_P2X]
+ paddw xmm5, [rel PW_DESCALE_P2X]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
+ pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
+ pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm2, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm6, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
+ pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
+ pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
+ pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
+ pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
+ pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm1, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
+ pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
+ pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [rel PD_DESCALE_P2]
+ paddd xmm7, [rel PD_DESCALE_P2]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
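+; (This constant is 1.5 * 2^26, whose float mantissa ULP is 2^3 = 8.
+; Adding it to a float x of small magnitude pins the exponent, so the low
+; 16 bits of the sum hold roundint(x/8); the row pass below extracts them
+; with a word mask.)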
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [workspace]
+ collect_args 4
+ push rbx
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+ lea rdi, [workspace] ; FAST_FLOAT *wsptr
+ mov rcx, DCTSIZE/4 ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
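+ ; (A block with only a DC term transforms to a constant block, so the
+ ; four dequantized DC values are simply broadcast down their columns.)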
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add rsi, byte 4*SIZEOF_JCOEF ; coef_block
+ add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec rcx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ lea rsi, [workspace] ; FAST_FLOAT *wsptr
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+ mov rcx, DCTSIZE/4 ; ctr
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+ add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add rdi, byte 4*SIZEOF_JSAMPROW
+ dec rcx ; ctr
+ jnz near .rowloop
+
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
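+
+; For example, with CONST_BITS == 8: pmulhw(x << 2, F_1_414 << 6) =
+; (x * 4 * 23168) >> 16 = (x * 92672) >> 16 ~= x * 1.41406, returning the
+; product to the input's scale.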
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [rel PW_F1414]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
+ pmulhw xmm0, [rel PW_MF1613]
+ pmulhw xmm2, [rel PW_F1082]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F1414]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
+ pmulhw xmm6, [rel PW_MF1613]
+ pmulhw xmm0, [rel PW_F1082]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
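+; (The samples are computed centered on zero: packsswb saturates the
+; descaled words to signed bytes, then adding CENTERJSAMPLE (128) per
+; byte via PB_CENTERJSAMP shifts them into the unsigned 0..255 range.)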
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for more
+; details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
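+
+; (For reference, these values mirror the fixed-point macros of the C
+; implementation in jidctint.c, roughly:
+;
+;   #define FIX(x)         ((int32_t)((x) * (1L << CONST_BITS) + 0.5))
+;   #define DESCALE(x, n)  (((x) + (1L << ((n) - 1))) >> (n))
+;
+; e.g. F_0_541 = FIX(0.541196100) = 4433 at CONST_BITS = 13; the
+; PD_DESCALE_P* constants defined below hold the 1 << (n - 1) rounding
+; term that is added before each arithmetic right shift.)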
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8
+ vpermq %6, %2, 0x72
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
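+
+; (Net effect: an ordinary 8x8 transpose of 16-bit elements, i.e. the
+; scalar loop below (illustration only), with the vpermq steps
+; pre-arranging the 128-bit lanes so that every vpunpck stage can stay
+; in-lane:
+;
+;   for (i = 0; i < 8; i++)
+;     for (j = 0; j < i; j++) {
+;       int16_t t = m[i][j];  m[i][j] = m[j][i];  m[j][i] = t;
+;     }
+; )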
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%12: Temp registers
+; %13: Pass (1 or 2)
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
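+;
+; (Check: distributing z1 = (z2 + z3) * 0.541196100 gives
+;   tmp2 = 0.541196100 * z2 + (0.541196100 - 1.847759065) * z3
+;   tmp3 = (0.541196100 + 0.765366865) * z2 + 0.541196100 * z3
+; i.e. each output is a two-term dot product of (z2, z3).)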
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [rel PW_1_NEG1]
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
+
+ vpxor %1, %1, %1
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L
+ vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
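+;
+; (Each vpmaddwd below evaluates those two-term forms directly: with
+; words interleaved as (a0, b0, a1, b1, ...) and a constant pattern of
+; (c, d) pairs, every dword lane of the result is a*c + b*d as a 32-bit
+; sum.)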
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage
+
+ vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [rel PD_DESCALE_P %+ %13]
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %13]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [rel PD_DESCALE_P %+ %13]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %13]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %2, %7, %8 ; %2=data3_2
+
+ vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %3, %7, %8 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+ times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ mov rbp, rsp ; rbp = aligned rbp
+ push_xmm 4
+ collect_args 4
+
+ ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+ mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0
+ vpacksswb xmm1, xmm1, xmm1
+ vpacksswb xmm1, xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
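+
+; (Pass 1 leaves the block transposed, so the row pass below can reuse
+; the same dodct macro; only the descale shift differs, selected by its
+; final argument: DESCALE_P1 for pass 1, DESCALE_P2 for pass 2.)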
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP]
+ vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ vzeroupper
+
+ mov eax, r13d
+
+ mov rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+ mov rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ mov rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ uncollect_args 4
+ pop_xmm 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for more
+; details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 12
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
+ pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
+ pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
+ pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
+ pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
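+
+; The F_* values above are the IDCT multipliers in CONST_BITS-bit fixed
+; point, i.e. FIX(x) = round(x * 2^CONST_BITS).  As a quick check with
+; CONST_BITS == 13:  1.847759065 * 8192 = 15136.8, which rounds to the
+; 15137 defined for F_1_847.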
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+%endif
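+
+; (The zero test above deliberately skips row 0, the DC term, and row 4:
+; the 4x4 reduced IDCT never uses the row-4 coefficients, so they cannot
+; affect the output.  See jidctred.c.)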
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
+ pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
+ pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [rel PB_CENTERJSAMP]
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+ mov rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
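+ ;
+ ; (Only rows/columns 0, 1, 3, 5, and 7 contribute to the 2x2 output;
+ ; the coefficients marked ** above are discarded by the reduced
+ ; transform.)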
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [rel PW_F362_MF127]
+ pmaddwd xmm5, [rel PW_F085_MF072]
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [rel PW_F362_MF127]
+ pmaddwd xmm2, [rel PW_F085_MF072]
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [rel PW_F362_MF127]
+ pmaddwd xmm7, [rel PW_F085_MF072]
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [rel PD_DESCALE_P2_2]
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [rel PB_CENTERJSAMP]
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov word [rdx+rax*SIZEOF_JSAMPLE], bx
+ mov word [rsi+rax*SIZEOF_JSAMPLE], cx
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
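+
+; Roughly, in C (a sketch of the semantics, not the actual library code):
+;
+;   workspace[i] = (FAST_FLOAT)(sample_data[row][start_col + col] -
+;                               CENTERJSAMPLE);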
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+ push rbx
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/2
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
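+
+ ; (The PUNPCK doubling above and the PSRADs below sign-extend each
+ ; sample byte into a dword: the byte lands in the top byte of its
+ ; dword, and an arithmetic shift right by 24 brings it back down.)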
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add rsi, byte 2*SIZEOF_JSAMPROW
+ add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
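+
+; Roughly, in C (a sketch only; the actual rounding below is performed
+; by CVTPS2DQ, i.e. MXCSR round-to-nearest):
+;
+;   coef_block[i] = (JCOEF)lrintf(workspace[i] * divisors[i]);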
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/16
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+ add rsi, byte 16*SIZEOF_FAST_FLOAT
+ add rdx, byte 16*SIZEOF_FAST_FLOAT
+ add rdi, byte 16*SIZEOF_JCOEF
+ dec rax
+ jnz short .quantloop
+
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov eax, r11d
+
+ mov rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
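+ ; (All-ones words shifted left by 7 give 0xFF80 = -128, so the
+ ; VPADDWs below subtract CENTERJSAMPLE from every sample.)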
+
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm3, ymm3, ymm7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
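+
+; For each coefficient, the divisors block stores a 16-bit reciprocal, a
+; correction/rounding term, and a post-scale, so the code below computes
+; (roughly, in C):
+;
+;   t = abs(x) + correction[i];
+;   t = ((unsigned)t * reciprocal[i]) >> 16;
+;   t = ((unsigned)t * scale[i]) >> 16;
+;   coef[i] = (x < 0) ? -t : t;
+;
+; i.e. a division by the quantization step without a hardware divide.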
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,r11)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,r11)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,r11)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)] ; reciprocal
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)] ; scale
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)]
+
+ vpsignw ymm0, ymm0, ymm4
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+ push rbx
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/4
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 4*SIZEOF_JSAMPROW
+ add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/32
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
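+
+ ; (The PSRAW/PXOR/PSUBW sequence above is the classic branchless
+ ; abs(): the shift yields an all-ones mask for negative words, and
+ ; (x ^ -1) - (-1) == -x.)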
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 32*SIZEOF_DCTELEM
+ add rdx, byte 32*SIZEOF_DCTELEM
+ add rdi, byte 32*SIZEOF_JCOEF
+ dec rax
+ jnz near .quantloop
+
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..584a010ad3
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimd.c
@@ -0,0 +1,1068 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16-byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32-byte alignment */
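+
+/*
+ * For example, IS_ALIGNED(ptr, 4) tests that the low 4 address bits are
+ * zero, i.e. that ptr is 16-byte-aligned, as the aligned SSE2 loads in
+ * the assembly code require.
+ */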
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_AVX2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
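+
+/*
+ * init_simd() is called lazily from each jsimd_can_*() query below and
+ * caches the CPUID result in simd_support; in practice the race noted
+ * above can only cause redundant re-detection of the same flags.
+ */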
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extrgb_ycc_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+ sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extbgr_ycc_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+ sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_rgb_ycc_convert_avx2;
+ sse2fct = jsimd_rgb_ycc_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_gray_convert_avx2;
+ sse2fct = jsimd_extrgb_gray_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_gray_convert_avx2;
+ sse2fct = jsimd_extrgbx_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_gray_convert_avx2;
+ sse2fct = jsimd_extbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_gray_convert_avx2;
+ sse2fct = jsimd_extbgrx_gray_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_gray_convert_avx2;
+ sse2fct = jsimd_extxbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_gray_convert_avx2;
+ sse2fct = jsimd_extxrgb_gray_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_rgb_gray_convert_avx2;
+ sse2fct = jsimd_rgb_gray_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_ycc_extrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extrgb_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+ sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_ycc_extbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extbgr_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+ sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_ycc_rgb_convert_avx2;
+ sse2fct = jsimd_ycc_rgb_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else
+ sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+ break;
+ default:
+ avx2fct = jsimd_h2v2_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_merged_upsample_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+ break;
+ default:
+ avx2fct = jsimd_h2v1_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_merged_upsample_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_convsamp_avx2(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_fdct_islow_avx2(data);
+ else
+ jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_quantize_avx2(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+ if (sizeof(FLOAT_MULT_TYPE) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+ IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
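Each jsimd_can_*() probe above advertises whether a SIMD path is safe to
use; the core library pairs it with the same-named dispatcher once, at
setup time (for the IDCT case this happens in jddctmgr.c). A minimal
sketch of that pairing in C, illustrative only: jpeg_idct_islow() is the
portable fallback from jidctint.c, and the types come from
jpeglib.h/jpegint.h/jsimd.h.

    /* Sketch: choose the SIMD islow IDCT when the probe says it is
       usable, otherwise fall back to the portable C implementation. */
    static void
    choose_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                      JCOEFPTR coef_block, JSAMPARRAY output_buf,
                      JDIMENSION output_col)
    {
      if (jsimd_can_idct_islow())
        jsimd_idct_islow(cinfo, compptr, coef_block, output_buf, output_col);
      else
        jpeg_idct_islow(cinfo, compptr, coef_block, output_buf, output_col);
    }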
diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push rbx
+ push rdi
+
+ xor rdi, rdi ; simd support flag
+
+ ; Assume that all x86-64 processors support SSE & SSE2 instructions
+ or rdi, JSIMD_SSE2
+ or rdi, JSIMD_SSE
+
+ ; Check whether CPUID leaf 07H is supported
+ ; (leaf 07H is used to check for AVX2 instruction support)
+ mov rax, 0
+ cpuid
+ cmp rax, 7
+ jl short .return ; Maximum leaf < 07H
+
+ ; Check for AVX2 instruction support
+ mov rax, 7
+ xor rcx, rcx
+ cpuid
+ mov rax, rbx ; rax = Extended feature flags
+
+ test rax, 1<<5 ; bit5:AVX2
+ jz short .return
+
+ ; Check for AVX2 O/S support
+ mov rax, 1
+ xor rcx, rcx
+ cpuid
+ test rcx, 1<<27
+ jz short .return ; O/S does not support XSAVE
+ test rcx, 1<<28
+ jz short .return ; CPU does not support AVX
+
+ xor rcx, rcx
+ xgetbv
+ and rax, 6
+ cmp rax, 6 ; O/S does not manage XMM/YMM state
+ ; using XSAVE
+ jnz short .return
+
+ or rdi, JSIMD_AVX2
+
+.return:
+ mov rax, rdi
+
+ pop rdi
+ pop rbx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
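The routine above assumes SSE/SSE2 unconditionally (architecturally
guaranteed on x86-64) and then walks CPUID leaf 07H, CPUID leaf 01H and
XGETBV to confirm that both the CPU and the OS support AVX2. For readers
who don't speak NASM, here is a rough C analogue using GCC/Clang
builtins; a sketch only, not part of this tree (the builtin's libgcc
runtime performs a comparable CPUID and xgetbv validation internally):

    /* Sketch of jpeg_simd_cpu_support() in C, assuming GCC or Clang
       and the JSIMD_* flag constants from simd/jsimd.h. */
    unsigned int simd_cpu_support_sketch(void)
    {
      unsigned int flags = JSIMD_SSE | JSIMD_SSE2;  /* x86-64 baseline */

      __builtin_cpu_init();
      /* True only when the CPUID.7:EBX AVX2 bit is set and the OS
         manages YMM state via XSAVE. */
      if (__builtin_cpu_supports("avx2"))
        flags |= JSIMD_AVX2;
      return flags;
    }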
diff --git a/media/libogg/CHANGES b/media/libogg/CHANGES
index 3f2e0fb261..48f671e59b 100644
--- a/media/libogg/CHANGES
+++ b/media/libogg/CHANGES
@@ -1,3 +1,27 @@
+Version 1.3.5 (2020 June 3)
+
+ * Fix unsigned typedef problem on macOS.
+ * Fix overflow check in ogg_sync_buffer.
+ * Clean up cmake and autotools build files.
+ * Remove Symbian and Apple XCode build files.
+ * Fix documentation cross-reference links.
+
+Version 1.3.4 (2019 August 30)
+
+ * Faster slice-by-8 CRC32 implementation;
+   see https://lwn.net/Articles/453931/ for motivation.
+ * Add CMake build.
+ * Deprecate Visual Studio project files in favor of CMake.
+ * Add a configure --disable-crc option for fuzzing.
+ * Various build fixes.
+ * Documentation and example code fixes.
+
+Version 1.3.3 (2017 November 7)
+
+ * Fix an issue with corrupt continued packet handling.
+ * Update Windows projects and build settings.
+ * Remove Mac OS 9 build support.
+
Version 1.3.2 (2014 May 27)
 * Fix a bug in oggpack_writecopy().
@@ -76,7 +100,7 @@ Version 1.1 (2003 November 17)
* big-endian bitpacker routines for Theora
* various portability fixes
- * improved API documenation
+ * improved API documentation
* RFC 3533 documentation of the format by Silvia Pfeiffer at CSIRO
* RFC 3534 documentation of the application/ogg mime-type by Linus Walleij
diff --git a/media/libogg/README b/media/libogg/README
deleted file mode 100644
index 2db22e65f4..0000000000
--- a/media/libogg/README
+++ /dev/null
@@ -1,97 +0,0 @@
-********************************************************************
-* *
-* THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
-* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
-* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
-* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
-* *
-* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2011 *
-* by the Xiph.Org Foundation http://www.xiph.org/ *
-* *
-********************************************************************
-
-= WHAT'S HERE =
-
-This source distribution includes libogg and nothing else. Other modules
-(eg, the modules libvorbis, vorbis-tools for the Vorbis music codec,
-libtheora for the Theora video codec) contain the codec libraries for
-use with Ogg bitstreams.
-
-Directory:
-
-./src The source for libogg, a BSD-license inplementation of
- the public domain Ogg bitstream format
-
-./include Library API headers
-
-./doc Ogg specification and libogg API documents
-
-./win32 Win32 projects and build automation
-
-./macosx Mac OS X project and build files
-
-= WHAT IS OGG? =
-
-Ogg project codecs use the Ogg bitstream format to arrange the raw,
-compressed bitstream into a more robust, useful form. For example,
-the Ogg bitstream makes seeking, time stamping and error recovery
-possible, as well as mixing several sepearate, concurrent media
-streams into a single physical bitstream.
-
-= CONTACT =
-
-The Ogg homepage is located at 'https://www.xiph.org/ogg/'.
-Up to date technical documents, contact information, source code and
-pre-built utilities may be found there.
-
-BUILDING FROM TARBALL DISTRIBUTIONS:
-
-./configure
-make
-
-and optionally (as root):
-make install
-
-This will install the Ogg libraries (static and shared) into
-/usr/local/lib, includes into /usr/local/include and API
-documentation into /usr/local/share/doc.
-
-BUILDING FROM REPOSITORY SOURCE:
-
-A standard svn build should consist of nothing more than:
-
-./autogen.sh
-make
-
-and as root if desired :
-
-make install
-
-BUILDING ON WIN32:
-
-Use the project file in the win32 directory. It should compile out of the box.
-
-CROSS COMPILING FROM LINUX TO WIN32:
-
-It is also possible to cross compile from Linux to windows using the MinGW
-cross tools and even to run the test suite under Wine, the Linux/*nix
-windows emulator.
-
-On Debian and Ubuntu systems, these cross compiler tools can be installed
-by doing:
-
- sudo apt-get mingw32 mingw32-binutils mingw32-runtime wine
-
-Once these tools are installed its possible to compile and test by
-executing the following commands, or something similar depending on
-your system:
-
- ./configure --host=i586-mingw32msvc --target=i586-mingw32msvc \
- --build=i586-linux
- make
- make check
-
-(Build instructions for Ogg codecs such as vorbis are similar and may
-be found in those source modules' README files)
-
-$Id: README 18096 2011-09-22 23:32:51Z giles $
diff --git a/media/libogg/README.md b/media/libogg/README.md
new file mode 100644
index 0000000000..0101cb14dc
--- /dev/null
+++ b/media/libogg/README.md
@@ -0,0 +1,160 @@
+# Ogg
+
+[![Travis Build Status](https://travis-ci.org/xiph/ogg.svg?branch=master)](https://travis-ci.org/xiph/ogg)
+[![Jenkins Build Status](https://mf4.xiph.org/jenkins/job/libogg/badge/icon)](https://mf4.xiph.org/jenkins/job/libogg/)
+[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/xiph/ogg?branch=master&svg=true)](https://ci.appveyor.com/project/rillian/ogg)
+
+Ogg project codecs use the Ogg bitstream format to arrange the raw,
+compressed bitstream into a more robust, useful form. For example,
+the Ogg bitstream makes seeking, time stamping and error recovery
+possible, as well as mixing several separate, concurrent media
+streams into a single physical bitstream.
+
+## What's here ##
+This source distribution includes libogg and nothing else. Other modules
+(e.g., libvorbis and vorbis-tools for the Vorbis music codec,
+libtheora for the Theora video codec) contain the codec libraries for
+use with Ogg bitstreams.
+
+Directory:
+
+- `src` The source for libogg, a BSD-licensed implementation of the public domain Ogg bitstream format
+
+- `include` Library API headers
+
+- `doc` Ogg specification and libogg API documents
+
+- `win32` Win32 projects and build automation
+
+## Contact ##
+
+The Ogg homepage is located at https://www.xiph.org/ogg/ .
+Up to date technical documents, contact information, source code and
+pre-built utilities may be found there.
+
+## Building ##
+
+#### Building from tarball distributions ####
+
+ ./configure
+ make
+
+and optionally (as root):
+
+ make install
+
+This will install the Ogg libraries (static and shared) into
+/usr/local/lib, includes into /usr/local/include and API
+documentation into /usr/local/share/doc.
+
+#### Building from repository source ####
+
+A standard build from the git repository should consist of nothing more than:
+
+ ./autogen.sh
+ ./configure
+ make
+
+and, as root if desired:
+
+ make install
+
+#### Building on Windows ####
+
+Use the project file in the win32 directory. It should compile out of the box.
+
+#### Cross-compiling from Linux to Windows ####
+
+It is also possible to cross-compile from Linux to Windows using the MinGW
+cross tools, and even to run the test suite under Wine, the Windows
+compatibility layer for Linux/*nix.
+
+On Debian and Ubuntu systems, these cross compiler tools can be installed
+by doing:
+
+ sudo apt-get install mingw32 mingw32-binutils mingw32-runtime wine
+
+Once these tools are installed, it's possible to compile and test by
+executing the following commands, or something similar depending on
+your system:
+
+ ./configure --host=i586-mingw32msvc --target=i586-mingw32msvc --build=i586-linux
+ make
+ make check
+
+(Build instructions for Ogg codecs such as vorbis are similar and may
+be found in those source modules' README files)
+
+## Building with CMake ##
+
+Ogg supports building using [CMake](http://www.cmake.org/). CMake is a meta build system that generates native projects for each platform.
+To generate projects, run cmake, replacing `YOUR-PROJECT-GENERATOR` with a generator from the list [here](http://www.cmake.org/cmake/help/v3.2/manual/cmake-generators.7.html):
+
+ mkdir build
+ cd build
+ cmake -G YOUR-PROJECT-GENERATOR ..
+
+Note that by default cmake generates projects that will build static libraries.
+To generate projects that will build a dynamic library, use the `BUILD_SHARED_LIBS` option like this:
+
+ cmake -G YOUR-PROJECT-GENERATOR -DBUILD_SHARED_LIBS=1 ..
+
+After the projects are generated, use them as usual.
+
+#### Building on Windows ####
+
+Use the proper generator for your Visual Studio version, for example:
+
+ cmake -G "Visual Studio 12 2013" ..
+
+#### Building on Mac OS X ####
+
+Use the Xcode generator. To build a framework, run:
+
+ cmake -G Xcode -DBUILD_FRAMEWORK=1 ..
+
+#### Building on Linux ####
+
+Use the Makefile generator, which is the default:
+
+ cmake ..
+ make
+
+## Testing ##
+
+This package includes a collection of automated tests.
+Running them is optional; they are not part of the build or install steps.
+
+### Unix-like System or MinGW ###
+
+If built with automake:
+
+ make check
+
+If built with CMake:
+
+ make test
+
+or:
+
+ ctest
+
+### Windows with MSBuild ###
+
+If build with configuration type "Debug", then:
+
+ ctest -C Debug
+
+If build with configuration type "Release", then:
+
+ ctest -C Release
+
+## License ##
+
+THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.
+USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS
+GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE
+IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
+
+THE OggVorbis SOURCE CODE IS COPYRIGHT (C) 1994-2019
+by the Xiph.Org Foundation https://www.xiph.org/
diff --git a/media/libogg/README_MOZILLA b/media/libogg/README_MOZILLA
index 6213fdc777..7b10a7c19e 100644
--- a/media/libogg/README_MOZILLA
+++ b/media/libogg/README_MOZILLA
@@ -1,4 +1,4 @@
-Version: 1.3.2
+Version: 1.3.5
The source from this directory was extracted from the official source
package downloaded from xiph.org and copied using the update.sh script.
diff --git a/media/libogg/include/crctable.h b/media/libogg/include/crctable.h
new file mode 100644
index 0000000000..dcc378b309
--- /dev/null
+++ b/media/libogg/include/crctable.h
@@ -0,0 +1,278 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2018 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************/
+
+#include <ogg/os_types.h>
+
+static const ogg_uint32_t crc_lookup[8][256]={
+{0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
+ 0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
+ 0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
+ 0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
+ 0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
+ 0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
+ 0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
+ 0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
+ 0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
+ 0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
+ 0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
+ 0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
+ 0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
+ 0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
+ 0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
+ 0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
+ 0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
+ 0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
+ 0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
+ 0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
+ 0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
+ 0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
+ 0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
+ 0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
+ 0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
+ 0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
+ 0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
+ 0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
+ 0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
+ 0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
+ 0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
+ 0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4},
+
+{0x00000000,0xd219c1dc,0xa0f29e0f,0x72eb5fd3,0x452421a9,0x973de075,0xe5d6bfa6,0x37cf7e7a,
+ 0x8a484352,0x5851828e,0x2abadd5d,0xf8a31c81,0xcf6c62fb,0x1d75a327,0x6f9efcf4,0xbd873d28,
+ 0x10519b13,0xc2485acf,0xb0a3051c,0x62bac4c0,0x5575baba,0x876c7b66,0xf58724b5,0x279ee569,
+ 0x9a19d841,0x4800199d,0x3aeb464e,0xe8f28792,0xdf3df9e8,0x0d243834,0x7fcf67e7,0xadd6a63b,
+ 0x20a33626,0xf2baf7fa,0x8051a829,0x524869f5,0x6587178f,0xb79ed653,0xc5758980,0x176c485c,
+ 0xaaeb7574,0x78f2b4a8,0x0a19eb7b,0xd8002aa7,0xefcf54dd,0x3dd69501,0x4f3dcad2,0x9d240b0e,
+ 0x30f2ad35,0xe2eb6ce9,0x9000333a,0x4219f2e6,0x75d68c9c,0xa7cf4d40,0xd5241293,0x073dd34f,
+ 0xbabaee67,0x68a32fbb,0x1a487068,0xc851b1b4,0xff9ecfce,0x2d870e12,0x5f6c51c1,0x8d75901d,
+ 0x41466c4c,0x935fad90,0xe1b4f243,0x33ad339f,0x04624de5,0xd67b8c39,0xa490d3ea,0x76891236,
+ 0xcb0e2f1e,0x1917eec2,0x6bfcb111,0xb9e570cd,0x8e2a0eb7,0x5c33cf6b,0x2ed890b8,0xfcc15164,
+ 0x5117f75f,0x830e3683,0xf1e56950,0x23fca88c,0x1433d6f6,0xc62a172a,0xb4c148f9,0x66d88925,
+ 0xdb5fb40d,0x094675d1,0x7bad2a02,0xa9b4ebde,0x9e7b95a4,0x4c625478,0x3e890bab,0xec90ca77,
+ 0x61e55a6a,0xb3fc9bb6,0xc117c465,0x130e05b9,0x24c17bc3,0xf6d8ba1f,0x8433e5cc,0x562a2410,
+ 0xebad1938,0x39b4d8e4,0x4b5f8737,0x994646eb,0xae893891,0x7c90f94d,0x0e7ba69e,0xdc626742,
+ 0x71b4c179,0xa3ad00a5,0xd1465f76,0x035f9eaa,0x3490e0d0,0xe689210c,0x94627edf,0x467bbf03,
+ 0xfbfc822b,0x29e543f7,0x5b0e1c24,0x8917ddf8,0xbed8a382,0x6cc1625e,0x1e2a3d8d,0xcc33fc51,
+ 0x828cd898,0x50951944,0x227e4697,0xf067874b,0xc7a8f931,0x15b138ed,0x675a673e,0xb543a6e2,
+ 0x08c49bca,0xdadd5a16,0xa83605c5,0x7a2fc419,0x4de0ba63,0x9ff97bbf,0xed12246c,0x3f0be5b0,
+ 0x92dd438b,0x40c48257,0x322fdd84,0xe0361c58,0xd7f96222,0x05e0a3fe,0x770bfc2d,0xa5123df1,
+ 0x189500d9,0xca8cc105,0xb8679ed6,0x6a7e5f0a,0x5db12170,0x8fa8e0ac,0xfd43bf7f,0x2f5a7ea3,
+ 0xa22feebe,0x70362f62,0x02dd70b1,0xd0c4b16d,0xe70bcf17,0x35120ecb,0x47f95118,0x95e090c4,
+ 0x2867adec,0xfa7e6c30,0x889533e3,0x5a8cf23f,0x6d438c45,0xbf5a4d99,0xcdb1124a,0x1fa8d396,
+ 0xb27e75ad,0x6067b471,0x128ceba2,0xc0952a7e,0xf75a5404,0x254395d8,0x57a8ca0b,0x85b10bd7,
+ 0x383636ff,0xea2ff723,0x98c4a8f0,0x4add692c,0x7d121756,0xaf0bd68a,0xdde08959,0x0ff94885,
+ 0xc3cab4d4,0x11d37508,0x63382adb,0xb121eb07,0x86ee957d,0x54f754a1,0x261c0b72,0xf405caae,
+ 0x4982f786,0x9b9b365a,0xe9706989,0x3b69a855,0x0ca6d62f,0xdebf17f3,0xac544820,0x7e4d89fc,
+ 0xd39b2fc7,0x0182ee1b,0x7369b1c8,0xa1707014,0x96bf0e6e,0x44a6cfb2,0x364d9061,0xe45451bd,
+ 0x59d36c95,0x8bcaad49,0xf921f29a,0x2b383346,0x1cf74d3c,0xceee8ce0,0xbc05d333,0x6e1c12ef,
+ 0xe36982f2,0x3170432e,0x439b1cfd,0x9182dd21,0xa64da35b,0x74546287,0x06bf3d54,0xd4a6fc88,
+ 0x6921c1a0,0xbb38007c,0xc9d35faf,0x1bca9e73,0x2c05e009,0xfe1c21d5,0x8cf77e06,0x5eeebfda,
+ 0xf33819e1,0x2121d83d,0x53ca87ee,0x81d34632,0xb61c3848,0x6405f994,0x16eea647,0xc4f7679b,
+ 0x79705ab3,0xab699b6f,0xd982c4bc,0x0b9b0560,0x3c547b1a,0xee4dbac6,0x9ca6e515,0x4ebf24c9},
+
+{0x00000000,0x01d8ac87,0x03b1590e,0x0269f589,0x0762b21c,0x06ba1e9b,0x04d3eb12,0x050b4795,
+ 0x0ec56438,0x0f1dc8bf,0x0d743d36,0x0cac91b1,0x09a7d624,0x087f7aa3,0x0a168f2a,0x0bce23ad,
+ 0x1d8ac870,0x1c5264f7,0x1e3b917e,0x1fe33df9,0x1ae87a6c,0x1b30d6eb,0x19592362,0x18818fe5,
+ 0x134fac48,0x129700cf,0x10fef546,0x112659c1,0x142d1e54,0x15f5b2d3,0x179c475a,0x1644ebdd,
+ 0x3b1590e0,0x3acd3c67,0x38a4c9ee,0x397c6569,0x3c7722fc,0x3daf8e7b,0x3fc67bf2,0x3e1ed775,
+ 0x35d0f4d8,0x3408585f,0x3661add6,0x37b90151,0x32b246c4,0x336aea43,0x31031fca,0x30dbb34d,
+ 0x269f5890,0x2747f417,0x252e019e,0x24f6ad19,0x21fdea8c,0x2025460b,0x224cb382,0x23941f05,
+ 0x285a3ca8,0x2982902f,0x2beb65a6,0x2a33c921,0x2f388eb4,0x2ee02233,0x2c89d7ba,0x2d517b3d,
+ 0x762b21c0,0x77f38d47,0x759a78ce,0x7442d449,0x714993dc,0x70913f5b,0x72f8cad2,0x73206655,
+ 0x78ee45f8,0x7936e97f,0x7b5f1cf6,0x7a87b071,0x7f8cf7e4,0x7e545b63,0x7c3daeea,0x7de5026d,
+ 0x6ba1e9b0,0x6a794537,0x6810b0be,0x69c81c39,0x6cc35bac,0x6d1bf72b,0x6f7202a2,0x6eaaae25,
+ 0x65648d88,0x64bc210f,0x66d5d486,0x670d7801,0x62063f94,0x63de9313,0x61b7669a,0x606fca1d,
+ 0x4d3eb120,0x4ce61da7,0x4e8fe82e,0x4f5744a9,0x4a5c033c,0x4b84afbb,0x49ed5a32,0x4835f6b5,
+ 0x43fbd518,0x4223799f,0x404a8c16,0x41922091,0x44996704,0x4541cb83,0x47283e0a,0x46f0928d,
+ 0x50b47950,0x516cd5d7,0x5305205e,0x52dd8cd9,0x57d6cb4c,0x560e67cb,0x54679242,0x55bf3ec5,
+ 0x5e711d68,0x5fa9b1ef,0x5dc04466,0x5c18e8e1,0x5913af74,0x58cb03f3,0x5aa2f67a,0x5b7a5afd,
+ 0xec564380,0xed8eef07,0xefe71a8e,0xee3fb609,0xeb34f19c,0xeaec5d1b,0xe885a892,0xe95d0415,
+ 0xe29327b8,0xe34b8b3f,0xe1227eb6,0xe0fad231,0xe5f195a4,0xe4293923,0xe640ccaa,0xe798602d,
+ 0xf1dc8bf0,0xf0042777,0xf26dd2fe,0xf3b57e79,0xf6be39ec,0xf766956b,0xf50f60e2,0xf4d7cc65,
+ 0xff19efc8,0xfec1434f,0xfca8b6c6,0xfd701a41,0xf87b5dd4,0xf9a3f153,0xfbca04da,0xfa12a85d,
+ 0xd743d360,0xd69b7fe7,0xd4f28a6e,0xd52a26e9,0xd021617c,0xd1f9cdfb,0xd3903872,0xd24894f5,
+ 0xd986b758,0xd85e1bdf,0xda37ee56,0xdbef42d1,0xdee40544,0xdf3ca9c3,0xdd555c4a,0xdc8df0cd,
+ 0xcac91b10,0xcb11b797,0xc978421e,0xc8a0ee99,0xcdaba90c,0xcc73058b,0xce1af002,0xcfc25c85,
+ 0xc40c7f28,0xc5d4d3af,0xc7bd2626,0xc6658aa1,0xc36ecd34,0xc2b661b3,0xc0df943a,0xc10738bd,
+ 0x9a7d6240,0x9ba5cec7,0x99cc3b4e,0x981497c9,0x9d1fd05c,0x9cc77cdb,0x9eae8952,0x9f7625d5,
+ 0x94b80678,0x9560aaff,0x97095f76,0x96d1f3f1,0x93dab464,0x920218e3,0x906bed6a,0x91b341ed,
+ 0x87f7aa30,0x862f06b7,0x8446f33e,0x859e5fb9,0x8095182c,0x814db4ab,0x83244122,0x82fceda5,
+ 0x8932ce08,0x88ea628f,0x8a839706,0x8b5b3b81,0x8e507c14,0x8f88d093,0x8de1251a,0x8c39899d,
+ 0xa168f2a0,0xa0b05e27,0xa2d9abae,0xa3010729,0xa60a40bc,0xa7d2ec3b,0xa5bb19b2,0xa463b535,
+ 0xafad9698,0xae753a1f,0xac1ccf96,0xadc46311,0xa8cf2484,0xa9178803,0xab7e7d8a,0xaaa6d10d,
+ 0xbce23ad0,0xbd3a9657,0xbf5363de,0xbe8bcf59,0xbb8088cc,0xba58244b,0xb831d1c2,0xb9e97d45,
+ 0xb2275ee8,0xb3fff26f,0xb19607e6,0xb04eab61,0xb545ecf4,0xb49d4073,0xb6f4b5fa,0xb72c197d},
+
+{0x00000000,0xdc6d9ab7,0xbc1a28d9,0x6077b26e,0x7cf54c05,0xa098d6b2,0xc0ef64dc,0x1c82fe6b,
+ 0xf9ea980a,0x258702bd,0x45f0b0d3,0x999d2a64,0x851fd40f,0x59724eb8,0x3905fcd6,0xe5686661,
+ 0xf7142da3,0x2b79b714,0x4b0e057a,0x97639fcd,0x8be161a6,0x578cfb11,0x37fb497f,0xeb96d3c8,
+ 0x0efeb5a9,0xd2932f1e,0xb2e49d70,0x6e8907c7,0x720bf9ac,0xae66631b,0xce11d175,0x127c4bc2,
+ 0xeae946f1,0x3684dc46,0x56f36e28,0x8a9ef49f,0x961c0af4,0x4a719043,0x2a06222d,0xf66bb89a,
+ 0x1303defb,0xcf6e444c,0xaf19f622,0x73746c95,0x6ff692fe,0xb39b0849,0xd3ecba27,0x0f812090,
+ 0x1dfd6b52,0xc190f1e5,0xa1e7438b,0x7d8ad93c,0x61082757,0xbd65bde0,0xdd120f8e,0x017f9539,
+ 0xe417f358,0x387a69ef,0x580ddb81,0x84604136,0x98e2bf5d,0x448f25ea,0x24f89784,0xf8950d33,
+ 0xd1139055,0x0d7e0ae2,0x6d09b88c,0xb164223b,0xade6dc50,0x718b46e7,0x11fcf489,0xcd916e3e,
+ 0x28f9085f,0xf49492e8,0x94e32086,0x488eba31,0x540c445a,0x8861deed,0xe8166c83,0x347bf634,
+ 0x2607bdf6,0xfa6a2741,0x9a1d952f,0x46700f98,0x5af2f1f3,0x869f6b44,0xe6e8d92a,0x3a85439d,
+ 0xdfed25fc,0x0380bf4b,0x63f70d25,0xbf9a9792,0xa31869f9,0x7f75f34e,0x1f024120,0xc36fdb97,
+ 0x3bfad6a4,0xe7974c13,0x87e0fe7d,0x5b8d64ca,0x470f9aa1,0x9b620016,0xfb15b278,0x277828cf,
+ 0xc2104eae,0x1e7dd419,0x7e0a6677,0xa267fcc0,0xbee502ab,0x6288981c,0x02ff2a72,0xde92b0c5,
+ 0xcceefb07,0x108361b0,0x70f4d3de,0xac994969,0xb01bb702,0x6c762db5,0x0c019fdb,0xd06c056c,
+ 0x3504630d,0xe969f9ba,0x891e4bd4,0x5573d163,0x49f12f08,0x959cb5bf,0xf5eb07d1,0x29869d66,
+ 0xa6e63d1d,0x7a8ba7aa,0x1afc15c4,0xc6918f73,0xda137118,0x067eebaf,0x660959c1,0xba64c376,
+ 0x5f0ca517,0x83613fa0,0xe3168dce,0x3f7b1779,0x23f9e912,0xff9473a5,0x9fe3c1cb,0x438e5b7c,
+ 0x51f210be,0x8d9f8a09,0xede83867,0x3185a2d0,0x2d075cbb,0xf16ac60c,0x911d7462,0x4d70eed5,
+ 0xa81888b4,0x74751203,0x1402a06d,0xc86f3ada,0xd4edc4b1,0x08805e06,0x68f7ec68,0xb49a76df,
+ 0x4c0f7bec,0x9062e15b,0xf0155335,0x2c78c982,0x30fa37e9,0xec97ad5e,0x8ce01f30,0x508d8587,
+ 0xb5e5e3e6,0x69887951,0x09ffcb3f,0xd5925188,0xc910afe3,0x157d3554,0x750a873a,0xa9671d8d,
+ 0xbb1b564f,0x6776ccf8,0x07017e96,0xdb6ce421,0xc7ee1a4a,0x1b8380fd,0x7bf43293,0xa799a824,
+ 0x42f1ce45,0x9e9c54f2,0xfeebe69c,0x22867c2b,0x3e048240,0xe26918f7,0x821eaa99,0x5e73302e,
+ 0x77f5ad48,0xab9837ff,0xcbef8591,0x17821f26,0x0b00e14d,0xd76d7bfa,0xb71ac994,0x6b775323,
+ 0x8e1f3542,0x5272aff5,0x32051d9b,0xee68872c,0xf2ea7947,0x2e87e3f0,0x4ef0519e,0x929dcb29,
+ 0x80e180eb,0x5c8c1a5c,0x3cfba832,0xe0963285,0xfc14ccee,0x20795659,0x400ee437,0x9c637e80,
+ 0x790b18e1,0xa5668256,0xc5113038,0x197caa8f,0x05fe54e4,0xd993ce53,0xb9e47c3d,0x6589e68a,
+ 0x9d1cebb9,0x4171710e,0x2106c360,0xfd6b59d7,0xe1e9a7bc,0x3d843d0b,0x5df38f65,0x819e15d2,
+ 0x64f673b3,0xb89be904,0xd8ec5b6a,0x0481c1dd,0x18033fb6,0xc46ea501,0xa419176f,0x78748dd8,
+ 0x6a08c61a,0xb6655cad,0xd612eec3,0x0a7f7474,0x16fd8a1f,0xca9010a8,0xaae7a2c6,0x768a3871,
+ 0x93e25e10,0x4f8fc4a7,0x2ff876c9,0xf395ec7e,0xef171215,0x337a88a2,0x530d3acc,0x8f60a07b},
+
+{0x00000000,0x490d678d,0x921acf1a,0xdb17a897,0x20f48383,0x69f9e40e,0xb2ee4c99,0xfbe32b14,
+ 0x41e90706,0x08e4608b,0xd3f3c81c,0x9afeaf91,0x611d8485,0x2810e308,0xf3074b9f,0xba0a2c12,
+ 0x83d20e0c,0xcadf6981,0x11c8c116,0x58c5a69b,0xa3268d8f,0xea2bea02,0x313c4295,0x78312518,
+ 0xc23b090a,0x8b366e87,0x5021c610,0x192ca19d,0xe2cf8a89,0xabc2ed04,0x70d54593,0x39d8221e,
+ 0x036501af,0x4a686622,0x917fceb5,0xd872a938,0x2391822c,0x6a9ce5a1,0xb18b4d36,0xf8862abb,
+ 0x428c06a9,0x0b816124,0xd096c9b3,0x999bae3e,0x6278852a,0x2b75e2a7,0xf0624a30,0xb96f2dbd,
+ 0x80b70fa3,0xc9ba682e,0x12adc0b9,0x5ba0a734,0xa0438c20,0xe94eebad,0x3259433a,0x7b5424b7,
+ 0xc15e08a5,0x88536f28,0x5344c7bf,0x1a49a032,0xe1aa8b26,0xa8a7ecab,0x73b0443c,0x3abd23b1,
+ 0x06ca035e,0x4fc764d3,0x94d0cc44,0xddddabc9,0x263e80dd,0x6f33e750,0xb4244fc7,0xfd29284a,
+ 0x47230458,0x0e2e63d5,0xd539cb42,0x9c34accf,0x67d787db,0x2edae056,0xf5cd48c1,0xbcc02f4c,
+ 0x85180d52,0xcc156adf,0x1702c248,0x5e0fa5c5,0xa5ec8ed1,0xece1e95c,0x37f641cb,0x7efb2646,
+ 0xc4f10a54,0x8dfc6dd9,0x56ebc54e,0x1fe6a2c3,0xe40589d7,0xad08ee5a,0x761f46cd,0x3f122140,
+ 0x05af02f1,0x4ca2657c,0x97b5cdeb,0xdeb8aa66,0x255b8172,0x6c56e6ff,0xb7414e68,0xfe4c29e5,
+ 0x444605f7,0x0d4b627a,0xd65ccaed,0x9f51ad60,0x64b28674,0x2dbfe1f9,0xf6a8496e,0xbfa52ee3,
+ 0x867d0cfd,0xcf706b70,0x1467c3e7,0x5d6aa46a,0xa6898f7e,0xef84e8f3,0x34934064,0x7d9e27e9,
+ 0xc7940bfb,0x8e996c76,0x558ec4e1,0x1c83a36c,0xe7608878,0xae6deff5,0x757a4762,0x3c7720ef,
+ 0x0d9406bc,0x44996131,0x9f8ec9a6,0xd683ae2b,0x2d60853f,0x646de2b2,0xbf7a4a25,0xf6772da8,
+ 0x4c7d01ba,0x05706637,0xde67cea0,0x976aa92d,0x6c898239,0x2584e5b4,0xfe934d23,0xb79e2aae,
+ 0x8e4608b0,0xc74b6f3d,0x1c5cc7aa,0x5551a027,0xaeb28b33,0xe7bfecbe,0x3ca84429,0x75a523a4,
+ 0xcfaf0fb6,0x86a2683b,0x5db5c0ac,0x14b8a721,0xef5b8c35,0xa656ebb8,0x7d41432f,0x344c24a2,
+ 0x0ef10713,0x47fc609e,0x9cebc809,0xd5e6af84,0x2e058490,0x6708e31d,0xbc1f4b8a,0xf5122c07,
+ 0x4f180015,0x06156798,0xdd02cf0f,0x940fa882,0x6fec8396,0x26e1e41b,0xfdf64c8c,0xb4fb2b01,
+ 0x8d23091f,0xc42e6e92,0x1f39c605,0x5634a188,0xadd78a9c,0xe4daed11,0x3fcd4586,0x76c0220b,
+ 0xccca0e19,0x85c76994,0x5ed0c103,0x17dda68e,0xec3e8d9a,0xa533ea17,0x7e244280,0x3729250d,
+ 0x0b5e05e2,0x4253626f,0x9944caf8,0xd049ad75,0x2baa8661,0x62a7e1ec,0xb9b0497b,0xf0bd2ef6,
+ 0x4ab702e4,0x03ba6569,0xd8adcdfe,0x91a0aa73,0x6a438167,0x234ee6ea,0xf8594e7d,0xb15429f0,
+ 0x888c0bee,0xc1816c63,0x1a96c4f4,0x539ba379,0xa878886d,0xe175efe0,0x3a624777,0x736f20fa,
+ 0xc9650ce8,0x80686b65,0x5b7fc3f2,0x1272a47f,0xe9918f6b,0xa09ce8e6,0x7b8b4071,0x328627fc,
+ 0x083b044d,0x413663c0,0x9a21cb57,0xd32cacda,0x28cf87ce,0x61c2e043,0xbad548d4,0xf3d82f59,
+ 0x49d2034b,0x00df64c6,0xdbc8cc51,0x92c5abdc,0x692680c8,0x202be745,0xfb3c4fd2,0xb231285f,
+ 0x8be90a41,0xc2e46dcc,0x19f3c55b,0x50fea2d6,0xab1d89c2,0xe210ee4f,0x390746d8,0x700a2155,
+ 0xca000d47,0x830d6aca,0x581ac25d,0x1117a5d0,0xeaf48ec4,0xa3f9e949,0x78ee41de,0x31e32653},
+
+{0x00000000,0x1b280d78,0x36501af0,0x2d781788,0x6ca035e0,0x77883898,0x5af02f10,0x41d82268,
+ 0xd9406bc0,0xc26866b8,0xef107130,0xf4387c48,0xb5e05e20,0xaec85358,0x83b044d0,0x989849a8,
+ 0xb641ca37,0xad69c74f,0x8011d0c7,0x9b39ddbf,0xdae1ffd7,0xc1c9f2af,0xecb1e527,0xf799e85f,
+ 0x6f01a1f7,0x7429ac8f,0x5951bb07,0x4279b67f,0x03a19417,0x1889996f,0x35f18ee7,0x2ed9839f,
+ 0x684289d9,0x736a84a1,0x5e129329,0x453a9e51,0x04e2bc39,0x1fcab141,0x32b2a6c9,0x299aabb1,
+ 0xb102e219,0xaa2aef61,0x8752f8e9,0x9c7af591,0xdda2d7f9,0xc68ada81,0xebf2cd09,0xf0dac071,
+ 0xde0343ee,0xc52b4e96,0xe853591e,0xf37b5466,0xb2a3760e,0xa98b7b76,0x84f36cfe,0x9fdb6186,
+ 0x0743282e,0x1c6b2556,0x311332de,0x2a3b3fa6,0x6be31dce,0x70cb10b6,0x5db3073e,0x469b0a46,
+ 0xd08513b2,0xcbad1eca,0xe6d50942,0xfdfd043a,0xbc252652,0xa70d2b2a,0x8a753ca2,0x915d31da,
+ 0x09c57872,0x12ed750a,0x3f956282,0x24bd6ffa,0x65654d92,0x7e4d40ea,0x53355762,0x481d5a1a,
+ 0x66c4d985,0x7decd4fd,0x5094c375,0x4bbcce0d,0x0a64ec65,0x114ce11d,0x3c34f695,0x271cfbed,
+ 0xbf84b245,0xa4acbf3d,0x89d4a8b5,0x92fca5cd,0xd32487a5,0xc80c8add,0xe5749d55,0xfe5c902d,
+ 0xb8c79a6b,0xa3ef9713,0x8e97809b,0x95bf8de3,0xd467af8b,0xcf4fa2f3,0xe237b57b,0xf91fb803,
+ 0x6187f1ab,0x7aaffcd3,0x57d7eb5b,0x4cffe623,0x0d27c44b,0x160fc933,0x3b77debb,0x205fd3c3,
+ 0x0e86505c,0x15ae5d24,0x38d64aac,0x23fe47d4,0x622665bc,0x790e68c4,0x54767f4c,0x4f5e7234,
+ 0xd7c63b9c,0xccee36e4,0xe196216c,0xfabe2c14,0xbb660e7c,0xa04e0304,0x8d36148c,0x961e19f4,
+ 0xa5cb3ad3,0xbee337ab,0x939b2023,0x88b32d5b,0xc96b0f33,0xd243024b,0xff3b15c3,0xe41318bb,
+ 0x7c8b5113,0x67a35c6b,0x4adb4be3,0x51f3469b,0x102b64f3,0x0b03698b,0x267b7e03,0x3d53737b,
+ 0x138af0e4,0x08a2fd9c,0x25daea14,0x3ef2e76c,0x7f2ac504,0x6402c87c,0x497adff4,0x5252d28c,
+ 0xcaca9b24,0xd1e2965c,0xfc9a81d4,0xe7b28cac,0xa66aaec4,0xbd42a3bc,0x903ab434,0x8b12b94c,
+ 0xcd89b30a,0xd6a1be72,0xfbd9a9fa,0xe0f1a482,0xa12986ea,0xba018b92,0x97799c1a,0x8c519162,
+ 0x14c9d8ca,0x0fe1d5b2,0x2299c23a,0x39b1cf42,0x7869ed2a,0x6341e052,0x4e39f7da,0x5511faa2,
+ 0x7bc8793d,0x60e07445,0x4d9863cd,0x56b06eb5,0x17684cdd,0x0c4041a5,0x2138562d,0x3a105b55,
+ 0xa28812fd,0xb9a01f85,0x94d8080d,0x8ff00575,0xce28271d,0xd5002a65,0xf8783ded,0xe3503095,
+ 0x754e2961,0x6e662419,0x431e3391,0x58363ee9,0x19ee1c81,0x02c611f9,0x2fbe0671,0x34960b09,
+ 0xac0e42a1,0xb7264fd9,0x9a5e5851,0x81765529,0xc0ae7741,0xdb867a39,0xf6fe6db1,0xedd660c9,
+ 0xc30fe356,0xd827ee2e,0xf55ff9a6,0xee77f4de,0xafafd6b6,0xb487dbce,0x99ffcc46,0x82d7c13e,
+ 0x1a4f8896,0x016785ee,0x2c1f9266,0x37379f1e,0x76efbd76,0x6dc7b00e,0x40bfa786,0x5b97aafe,
+ 0x1d0ca0b8,0x0624adc0,0x2b5cba48,0x3074b730,0x71ac9558,0x6a849820,0x47fc8fa8,0x5cd482d0,
+ 0xc44ccb78,0xdf64c600,0xf21cd188,0xe934dcf0,0xa8ecfe98,0xb3c4f3e0,0x9ebce468,0x8594e910,
+ 0xab4d6a8f,0xb06567f7,0x9d1d707f,0x86357d07,0xc7ed5f6f,0xdcc55217,0xf1bd459f,0xea9548e7,
+ 0x720d014f,0x69250c37,0x445d1bbf,0x5f7516c7,0x1ead34af,0x058539d7,0x28fd2e5f,0x33d52327},
+
+{0x00000000,0x4f576811,0x9eaed022,0xd1f9b833,0x399cbdf3,0x76cbd5e2,0xa7326dd1,0xe86505c0,
+ 0x73397be6,0x3c6e13f7,0xed97abc4,0xa2c0c3d5,0x4aa5c615,0x05f2ae04,0xd40b1637,0x9b5c7e26,
+ 0xe672f7cc,0xa9259fdd,0x78dc27ee,0x378b4fff,0xdfee4a3f,0x90b9222e,0x41409a1d,0x0e17f20c,
+ 0x954b8c2a,0xda1ce43b,0x0be55c08,0x44b23419,0xacd731d9,0xe38059c8,0x3279e1fb,0x7d2e89ea,
+ 0xc824f22f,0x87739a3e,0x568a220d,0x19dd4a1c,0xf1b84fdc,0xbeef27cd,0x6f169ffe,0x2041f7ef,
+ 0xbb1d89c9,0xf44ae1d8,0x25b359eb,0x6ae431fa,0x8281343a,0xcdd65c2b,0x1c2fe418,0x53788c09,
+ 0x2e5605e3,0x61016df2,0xb0f8d5c1,0xffafbdd0,0x17cab810,0x589dd001,0x89646832,0xc6330023,
+ 0x5d6f7e05,0x12381614,0xc3c1ae27,0x8c96c636,0x64f3c3f6,0x2ba4abe7,0xfa5d13d4,0xb50a7bc5,
+ 0x9488f9e9,0xdbdf91f8,0x0a2629cb,0x457141da,0xad14441a,0xe2432c0b,0x33ba9438,0x7cedfc29,
+ 0xe7b1820f,0xa8e6ea1e,0x791f522d,0x36483a3c,0xde2d3ffc,0x917a57ed,0x4083efde,0x0fd487cf,
+ 0x72fa0e25,0x3dad6634,0xec54de07,0xa303b616,0x4b66b3d6,0x0431dbc7,0xd5c863f4,0x9a9f0be5,
+ 0x01c375c3,0x4e941dd2,0x9f6da5e1,0xd03acdf0,0x385fc830,0x7708a021,0xa6f11812,0xe9a67003,
+ 0x5cac0bc6,0x13fb63d7,0xc202dbe4,0x8d55b3f5,0x6530b635,0x2a67de24,0xfb9e6617,0xb4c90e06,
+ 0x2f957020,0x60c21831,0xb13ba002,0xfe6cc813,0x1609cdd3,0x595ea5c2,0x88a71df1,0xc7f075e0,
+ 0xbadefc0a,0xf589941b,0x24702c28,0x6b274439,0x834241f9,0xcc1529e8,0x1dec91db,0x52bbf9ca,
+ 0xc9e787ec,0x86b0effd,0x574957ce,0x181e3fdf,0xf07b3a1f,0xbf2c520e,0x6ed5ea3d,0x2182822c,
+ 0x2dd0ee65,0x62878674,0xb37e3e47,0xfc295656,0x144c5396,0x5b1b3b87,0x8ae283b4,0xc5b5eba5,
+ 0x5ee99583,0x11befd92,0xc04745a1,0x8f102db0,0x67752870,0x28224061,0xf9dbf852,0xb68c9043,
+ 0xcba219a9,0x84f571b8,0x550cc98b,0x1a5ba19a,0xf23ea45a,0xbd69cc4b,0x6c907478,0x23c71c69,
+ 0xb89b624f,0xf7cc0a5e,0x2635b26d,0x6962da7c,0x8107dfbc,0xce50b7ad,0x1fa90f9e,0x50fe678f,
+ 0xe5f41c4a,0xaaa3745b,0x7b5acc68,0x340da479,0xdc68a1b9,0x933fc9a8,0x42c6719b,0x0d91198a,
+ 0x96cd67ac,0xd99a0fbd,0x0863b78e,0x4734df9f,0xaf51da5f,0xe006b24e,0x31ff0a7d,0x7ea8626c,
+ 0x0386eb86,0x4cd18397,0x9d283ba4,0xd27f53b5,0x3a1a5675,0x754d3e64,0xa4b48657,0xebe3ee46,
+ 0x70bf9060,0x3fe8f871,0xee114042,0xa1462853,0x49232d93,0x06744582,0xd78dfdb1,0x98da95a0,
+ 0xb958178c,0xf60f7f9d,0x27f6c7ae,0x68a1afbf,0x80c4aa7f,0xcf93c26e,0x1e6a7a5d,0x513d124c,
+ 0xca616c6a,0x8536047b,0x54cfbc48,0x1b98d459,0xf3fdd199,0xbcaab988,0x6d5301bb,0x220469aa,
+ 0x5f2ae040,0x107d8851,0xc1843062,0x8ed35873,0x66b65db3,0x29e135a2,0xf8188d91,0xb74fe580,
+ 0x2c139ba6,0x6344f3b7,0xb2bd4b84,0xfdea2395,0x158f2655,0x5ad84e44,0x8b21f677,0xc4769e66,
+ 0x717ce5a3,0x3e2b8db2,0xefd23581,0xa0855d90,0x48e05850,0x07b73041,0xd64e8872,0x9919e063,
+ 0x02459e45,0x4d12f654,0x9ceb4e67,0xd3bc2676,0x3bd923b6,0x748e4ba7,0xa577f394,0xea209b85,
+ 0x970e126f,0xd8597a7e,0x09a0c24d,0x46f7aa5c,0xae92af9c,0xe1c5c78d,0x303c7fbe,0x7f6b17af,
+ 0xe4376989,0xab600198,0x7a99b9ab,0x35ced1ba,0xddabd47a,0x92fcbc6b,0x43050458,0x0c526c49},
+
+{0x00000000,0x5ba1dcca,0xb743b994,0xece2655e,0x6a466e9f,0x31e7b255,0xdd05d70b,0x86a40bc1,
+ 0xd48cdd3e,0x8f2d01f4,0x63cf64aa,0x386eb860,0xbecab3a1,0xe56b6f6b,0x09890a35,0x5228d6ff,
+ 0xadd8a7cb,0xf6797b01,0x1a9b1e5f,0x413ac295,0xc79ec954,0x9c3f159e,0x70dd70c0,0x2b7cac0a,
+ 0x79547af5,0x22f5a63f,0xce17c361,0x95b61fab,0x1312146a,0x48b3c8a0,0xa451adfe,0xfff07134,
+ 0x5f705221,0x04d18eeb,0xe833ebb5,0xb392377f,0x35363cbe,0x6e97e074,0x8275852a,0xd9d459e0,
+ 0x8bfc8f1f,0xd05d53d5,0x3cbf368b,0x671eea41,0xe1bae180,0xba1b3d4a,0x56f95814,0x0d5884de,
+ 0xf2a8f5ea,0xa9092920,0x45eb4c7e,0x1e4a90b4,0x98ee9b75,0xc34f47bf,0x2fad22e1,0x740cfe2b,
+ 0x262428d4,0x7d85f41e,0x91679140,0xcac64d8a,0x4c62464b,0x17c39a81,0xfb21ffdf,0xa0802315,
+ 0xbee0a442,0xe5417888,0x09a31dd6,0x5202c11c,0xd4a6cadd,0x8f071617,0x63e57349,0x3844af83,
+ 0x6a6c797c,0x31cda5b6,0xdd2fc0e8,0x868e1c22,0x002a17e3,0x5b8bcb29,0xb769ae77,0xecc872bd,
+ 0x13380389,0x4899df43,0xa47bba1d,0xffda66d7,0x797e6d16,0x22dfb1dc,0xce3dd482,0x959c0848,
+ 0xc7b4deb7,0x9c15027d,0x70f76723,0x2b56bbe9,0xadf2b028,0xf6536ce2,0x1ab109bc,0x4110d576,
+ 0xe190f663,0xba312aa9,0x56d34ff7,0x0d72933d,0x8bd698fc,0xd0774436,0x3c952168,0x6734fda2,
+ 0x351c2b5d,0x6ebdf797,0x825f92c9,0xd9fe4e03,0x5f5a45c2,0x04fb9908,0xe819fc56,0xb3b8209c,
+ 0x4c4851a8,0x17e98d62,0xfb0be83c,0xa0aa34f6,0x260e3f37,0x7dafe3fd,0x914d86a3,0xcaec5a69,
+ 0x98c48c96,0xc365505c,0x2f873502,0x7426e9c8,0xf282e209,0xa9233ec3,0x45c15b9d,0x1e608757,
+ 0x79005533,0x22a189f9,0xce43eca7,0x95e2306d,0x13463bac,0x48e7e766,0xa4058238,0xffa45ef2,
+ 0xad8c880d,0xf62d54c7,0x1acf3199,0x416eed53,0xc7cae692,0x9c6b3a58,0x70895f06,0x2b2883cc,
+ 0xd4d8f2f8,0x8f792e32,0x639b4b6c,0x383a97a6,0xbe9e9c67,0xe53f40ad,0x09dd25f3,0x527cf939,
+ 0x00542fc6,0x5bf5f30c,0xb7179652,0xecb64a98,0x6a124159,0x31b39d93,0xdd51f8cd,0x86f02407,
+ 0x26700712,0x7dd1dbd8,0x9133be86,0xca92624c,0x4c36698d,0x1797b547,0xfb75d019,0xa0d40cd3,
+ 0xf2fcda2c,0xa95d06e6,0x45bf63b8,0x1e1ebf72,0x98bab4b3,0xc31b6879,0x2ff90d27,0x7458d1ed,
+ 0x8ba8a0d9,0xd0097c13,0x3ceb194d,0x674ac587,0xe1eece46,0xba4f128c,0x56ad77d2,0x0d0cab18,
+ 0x5f247de7,0x0485a12d,0xe867c473,0xb3c618b9,0x35621378,0x6ec3cfb2,0x8221aaec,0xd9807626,
+ 0xc7e0f171,0x9c412dbb,0x70a348e5,0x2b02942f,0xada69fee,0xf6074324,0x1ae5267a,0x4144fab0,
+ 0x136c2c4f,0x48cdf085,0xa42f95db,0xff8e4911,0x792a42d0,0x228b9e1a,0xce69fb44,0x95c8278e,
+ 0x6a3856ba,0x31998a70,0xdd7bef2e,0x86da33e4,0x007e3825,0x5bdfe4ef,0xb73d81b1,0xec9c5d7b,
+ 0xbeb48b84,0xe515574e,0x09f73210,0x5256eeda,0xd4f2e51b,0x8f5339d1,0x63b15c8f,0x38108045,
+ 0x9890a350,0xc3317f9a,0x2fd31ac4,0x7472c60e,0xf2d6cdcf,0xa9771105,0x4595745b,0x1e34a891,
+ 0x4c1c7e6e,0x17bda2a4,0xfb5fc7fa,0xa0fe1b30,0x265a10f1,0x7dfbcc3b,0x9119a965,0xcab875af,
+ 0x3548049b,0x6ee9d851,0x820bbd0f,0xd9aa61c5,0x5f0e6a04,0x04afb6ce,0xe84dd390,0xb3ec0f5a,
+ 0xe1c4d9a5,0xba65056f,0x56876031,0x0d26bcfb,0x8b82b73a,0xd0236bf0,0x3cc10eae,0x6760d264}};
diff --git a/media/libogg/include/ogg/config_types.h b/media/libogg/include/ogg/config_types.h
index 1e7d490989..1a87df6423 100644
--- a/media/libogg/include/ogg/config_types.h
+++ b/media/libogg/include/ogg/config_types.h
@@ -1,7 +1,7 @@
#ifndef __CONFIG_TYPES_H__
#define __CONFIG_TYPES_H__
-/* these are filled in by configure */
+/* these are filled in by configure or cmake */
#define INCLUDE_INTTYPES_H 1
#define INCLUDE_STDINT_H 1
#define INCLUDE_SYS_TYPES_H 1
@@ -16,10 +16,11 @@
# include <sys/types.h>
#endif
-typedef short ogg_int16_t;
-typedef unsigned short ogg_uint16_t;
-typedef int ogg_int32_t;
-typedef unsigned int ogg_uint32_t;
-typedef long long ogg_int64_t;
+typedef int16_t ogg_int16_t;
+typedef uint16_t ogg_uint16_t;
+typedef int32_t ogg_int32_t;
+typedef uint32_t ogg_uint32_t;
+typedef int64_t ogg_int64_t;
+typedef uint64_t ogg_uint64_t;
#endif
diff --git a/media/libogg/include/ogg/ogg.h b/media/libogg/include/ogg/ogg.h
index cebe38ee1e..2789fc1ca2 100644
--- a/media/libogg/include/ogg/ogg.h
+++ b/media/libogg/include/ogg/ogg.h
@@ -11,7 +11,6 @@
********************************************************************
function: toplevel libogg include
- last mod: $Id: ogg.h 18044 2011-08-01 17:55:20Z gmaxwell $
********************************************************************/
#ifndef _OGG_H
diff --git a/media/libogg/include/ogg/os_types.h b/media/libogg/include/ogg/os_types.h
index 62d717b684..a8d4600fb0 100644
--- a/media/libogg/include/ogg/os_types.h
+++ b/media/libogg/include/ogg/os_types.h
@@ -10,8 +10,7 @@
* *
********************************************************************
- function: #ifdef jail to whip a few platforms into the UNIX ideal.
- last mod: $Id: os_types.h 19098 2014-02-26 19:06:45Z giles $
+ function: Define a consistent set of types on each platform.
********************************************************************/
#ifndef _OS_TYPES_H
@@ -65,36 +64,40 @@ extern ogg_free_function_type *ogg_free_func;
typedef unsigned long long ogg_uint64_t;
# elif defined(__MWERKS__)
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef short ogg_int16_t;
typedef unsigned short ogg_uint16_t;
# else
- /* MSVC/Borland */
- typedef __int64 ogg_int64_t;
- typedef __int32 ogg_int32_t;
- typedef unsigned __int32 ogg_uint32_t;
- typedef __int16 ogg_int16_t;
- typedef unsigned __int16 ogg_uint16_t;
+# if defined(_MSC_VER) && (_MSC_VER >= 1800) /* MSVC 2013 and newer */
+# include <stdint.h>
+ typedef int16_t ogg_int16_t;
+ typedef uint16_t ogg_uint16_t;
+ typedef int32_t ogg_int32_t;
+ typedef uint32_t ogg_uint32_t;
+ typedef int64_t ogg_int64_t;
+ typedef uint64_t ogg_uint64_t;
+# else
+ /* MSVC/Borland */
+ typedef __int64 ogg_int64_t;
+ typedef __int32 ogg_int32_t;
+ typedef unsigned __int32 ogg_uint32_t;
+ typedef unsigned __int64 ogg_uint64_t;
+ typedef __int16 ogg_int16_t;
+ typedef unsigned __int16 ogg_uint16_t;
+# endif
# endif
-#elif defined(__MACOS__)
-
-# include <sys/types.h>
- typedef SInt16 ogg_int16_t;
- typedef UInt16 ogg_uint16_t;
- typedef SInt32 ogg_int32_t;
- typedef UInt32 ogg_uint32_t;
- typedef SInt64 ogg_int64_t;
-
#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
-# include <inttypes.h>
+# include <sys/types.h>
typedef int16_t ogg_int16_t;
- typedef uint16_t ogg_uint16_t;
+ typedef u_int16_t ogg_uint16_t;
typedef int32_t ogg_int32_t;
- typedef uint32_t ogg_uint32_t;
+ typedef u_int32_t ogg_uint32_t;
typedef int64_t ogg_int64_t;
+ typedef u_int64_t ogg_uint64_t;
#elif defined(__sun__)
@@ -115,6 +118,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
#elif defined(__BEOS__)
@@ -125,6 +129,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef int32_t ogg_int32_t;
typedef uint32_t ogg_uint32_t;
typedef int64_t ogg_int64_t;
+ typedef uint64_t ogg_uint64_t;
#elif defined (__EMX__)
@@ -134,6 +139,8 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
+
#elif defined (DJGPP)
@@ -142,11 +149,13 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
#elif defined(R5900)
/* PS2 EE */
typedef long ogg_int64_t;
+ typedef unsigned long ogg_uint64_t;
typedef int ogg_int32_t;
typedef unsigned ogg_uint32_t;
typedef short ogg_int16_t;
@@ -159,6 +168,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef signed int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long int ogg_int64_t;
+ typedef unsigned long long int ogg_uint64_t;
#elif defined(__TMS320C6X__)
@@ -168,6 +178,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef signed int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long int ogg_int64_t;
+ typedef unsigned long long int ogg_uint64_t;
#else
diff --git a/media/libogg/moz.build b/media/libogg/moz.build
index abc8d02850..839d2b2aab 100644
--- a/media/libogg/moz.build
+++ b/media/libogg/moz.build
@@ -6,6 +6,10 @@
with Files('*'):
BUG_COMPONENT = ('Core', 'Video/Audio')
+EXPORTS += [
+ 'include/crctable.h',
+]
+
EXPORTS.ogg += [
'include/ogg/config_types.h',
'include/ogg/ogg.h',
diff --git a/media/libogg/src/ogg_bitwise.c b/media/libogg/src/ogg_bitwise.c
index 145901d185..f5ef79122e 100644
--- a/media/libogg/src/ogg_bitwise.c
+++ b/media/libogg/src/ogg_bitwise.c
@@ -11,7 +11,6 @@
********************************************************************
function: packing variable sized words into an octet stream
- last mod: $Id: bitwise.c 19149 2014-05-27 16:26:23Z giles $
********************************************************************/
@@ -890,7 +889,7 @@ int main(void){
for(i=0;i<test2size;i++){
if(oggpack_look(&r,32)==-1)report("out of data. failed!");
if(oggpack_look(&r,32)!=large[i]){
- fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpack_look(&r,32),large[i],
+ fprintf(stderr,"%ld != %lu (%lx!=%lx):",oggpack_look(&r,32),large[i],
oggpack_look(&r,32),large[i]);
report("read incorrect value!\n");
}
@@ -1000,7 +999,7 @@ int main(void){
for(i=0;i<test2size;i++){
if(oggpackB_look(&r,32)==-1)report("out of data. failed!");
if(oggpackB_look(&r,32)!=large[i]){
- fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpackB_look(&r,32),large[i],
+ fprintf(stderr,"%ld != %lu (%lx!=%lx):",oggpackB_look(&r,32),large[i],
oggpackB_look(&r,32),large[i]);
report("read incorrect value!\n");
}
diff --git a/media/libogg/src/ogg_framing.c b/media/libogg/src/ogg_framing.c
index 3a2f0a6058..724d116d7f 100644
--- a/media/libogg/src/ogg_framing.c
+++ b/media/libogg/src/ogg_framing.c
@@ -5,14 +5,13 @@
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
- * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2018 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
function: code raw packets into framed OggSquish stream and
decode Ogg streams back into raw packets
- last mod: $Id: framing.c 18758 2013-01-08 16:29:56Z tterribe $
note: The CRC code is directly derived from public domain code by
Ross Williams (ross@guest.adelaide.edu.au). See docs/framing.html
@@ -20,6 +19,10 @@
********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
#include <stdlib.h>
#include <limits.h>
#include <string.h>
@@ -45,7 +48,7 @@ int ogg_page_eos(const ogg_page *og){
ogg_int64_t ogg_page_granulepos(const ogg_page *og){
unsigned char *page=og->header;
- ogg_int64_t granulepos=page[13]&(0xff);
+ ogg_uint64_t granulepos=page[13]&(0xff);
granulepos= (granulepos<<8)|(page[12]&0xff);
granulepos= (granulepos<<8)|(page[11]&0xff);
granulepos= (granulepos<<8)|(page[10]&0xff);
@@ -53,21 +56,21 @@ ogg_int64_t ogg_page_granulepos(const ogg_page *og){
granulepos= (granulepos<<8)|(page[8]&0xff);
granulepos= (granulepos<<8)|(page[7]&0xff);
granulepos= (granulepos<<8)|(page[6]&0xff);
- return(granulepos);
+ return((ogg_int64_t)granulepos);
}
int ogg_page_serialno(const ogg_page *og){
- return(og->header[14] |
- (og->header[15]<<8) |
- (og->header[16]<<16) |
- (og->header[17]<<24));
+ return((int)((ogg_uint32_t)og->header[14]) |
+ ((ogg_uint32_t)og->header[15]<<8) |
+ ((ogg_uint32_t)og->header[16]<<16) |
+ ((ogg_uint32_t)og->header[17]<<24));
}
long ogg_page_pageno(const ogg_page *og){
- return(og->header[18] |
- (og->header[19]<<8) |
- (og->header[20]<<16) |
- (og->header[21]<<24));
+ return((long)((ogg_uint32_t)og->header[18]) |
+ ((ogg_uint32_t)og->header[19]<<8) |
+ ((ogg_uint32_t)og->header[20]<<16) |
+ ((ogg_uint32_t)og->header[21]<<24));
}
@@ -99,90 +102,31 @@ int ogg_page_packets(const ogg_page *og){
#if 0
/* helper to initialize lookup for direct-table CRC (illustrative; we
- use the static init below) */
-
-static ogg_uint32_t _ogg_crc_entry(unsigned long index){
- int i;
- unsigned long r;
-
- r = index << 24;
- for (i=0; i<8; i++)
- if (r & 0x80000000UL)
- r = (r << 1) ^ 0x04c11db7; /* The same as the ethernet generator
- polynomial, although we use an
- unreflected alg and an init/final
- of 0, not 0xffffffff */
- else
- r<<=1;
- return (r & 0xffffffffUL);
+ use the static init in crctable.h) */
+
+static void _ogg_crc_init(){
+ int i, j;
+ ogg_uint32_t polynomial, crc;
+ polynomial = 0x04c11db7; /* The same as the ethernet generator
+ polynomial, although we use an
+ unreflected alg and an init/final
+ of 0, not 0xffffffff */
+ for (i = 0; i <= 0xFF; i++){
+ crc = i << 24;
+
+ for (j = 0; j < 8; j++)
+ crc = (crc << 1) ^ (crc & (1 << 31) ? polynomial : 0);
+
+ crc_lookup[0][i] = crc;
+ }
+
+ for (i = 0; i <= 0xFF; i++)
+ for (j = 1; j < 8; j++)
+ crc_lookup[j][i] = crc_lookup[0][(crc_lookup[j - 1][i] >> 24) & 0xFF] ^ (crc_lookup[j - 1][i] << 8);
}
#endif
-static const ogg_uint32_t crc_lookup[256]={
- 0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,
- 0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
- 0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,
- 0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
- 0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,
- 0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
- 0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,
- 0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
- 0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,
- 0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
- 0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,
- 0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
- 0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,
- 0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
- 0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,
- 0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
- 0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,
- 0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
- 0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,
- 0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
- 0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,
- 0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
- 0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,
- 0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
- 0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,
- 0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
- 0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,
- 0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
- 0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,
- 0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
- 0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,
- 0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
- 0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,
- 0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
- 0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,
- 0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
- 0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,
- 0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
- 0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,
- 0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
- 0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,
- 0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
- 0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,
- 0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
- 0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,
- 0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
- 0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,
- 0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
- 0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,
- 0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
- 0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,
- 0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
- 0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,
- 0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
- 0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,
- 0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
- 0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,
- 0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
- 0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,
- 0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
- 0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,
- 0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
- 0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,
- 0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4};
+#include "crctable.h"
/* init the encode/decode logical stream state */
@@ -290,10 +234,27 @@ static int _os_lacing_expand(ogg_stream_state *os,long needed){
/* Direct table CRC; note that this will be faster in the future if we
perform the checksum simultaneously with other copies */
+static ogg_uint32_t _os_update_crc(ogg_uint32_t crc, unsigned char *buffer, int size){
+ while (size>=8){
+ crc^=((ogg_uint32_t)buffer[0]<<24)|((ogg_uint32_t)buffer[1]<<16)|((ogg_uint32_t)buffer[2]<<8)|((ogg_uint32_t)buffer[3]);
+
+ crc=crc_lookup[7][ crc>>24 ]^crc_lookup[6][(crc>>16)&0xFF]^
+ crc_lookup[5][(crc>> 8)&0xFF]^crc_lookup[4][ crc &0xFF]^
+ crc_lookup[3][buffer[4] ]^crc_lookup[2][buffer[5] ]^
+ crc_lookup[1][buffer[6] ]^crc_lookup[0][buffer[7] ];
+
+ buffer+=8;
+ size-=8;
+ }
+
+ while (size--)
+ crc=(crc<<8)^crc_lookup[0][((crc >> 24)&0xff)^*buffer++];
+ return crc;
+}
+
void ogg_page_checksum_set(ogg_page *og){
if(og){
ogg_uint32_t crc_reg=0;
- int i;
/* safety; needed for API behavior, but not framing code */
og->header[22]=0;
@@ -301,10 +262,8 @@ void ogg_page_checksum_set(ogg_page *og){
og->header[24]=0;
og->header[25]=0;
- for(i=0;i<og->header_len;i++)
- crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->header[i]];
- for(i=0;i<og->body_len;i++)
- crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->body[i]];
+ crc_reg=_os_update_crc(crc_reg,og->header,og->header_len);
+ crc_reg=_os_update_crc(crc_reg,og->body,og->body_len);
og->header[22]=(unsigned char)(crc_reg&0xff);
og->header[23]=(unsigned char)((crc_reg>>8)&0xff);
@@ -414,9 +373,9 @@ static int ogg_stream_flush_i(ogg_stream_state *os,ogg_page *og, int force, int
}else{
/* The extra packets_done, packet_just_done logic here attempts to do two things:
- 1) Don't unneccessarily span pages.
+ 1) Don't unnecessarily span pages.
2) Unless necessary, don't flush pages if there are less than four packets on
- them; this expands page size to reduce unneccessary overhead if incoming packets
+ them; this expands page size to reduce unnecessary overhead if incoming packets
are large.
These are not necessary behaviors, just 'always better than naive flushing'
without requiring an application to explicitly request a specific optimized
@@ -638,9 +597,14 @@ char *ogg_sync_buffer(ogg_sync_state *oy, long size){
if(size>oy->storage-oy->fill){
/* We need to extend the internal buffer */
- long newsize=size+oy->fill+4096; /* an extra page to be nice */
+ long newsize;
void *ret;
+ if(size>INT_MAX-4096-oy->fill){
+ ogg_sync_clear(oy);
+ return NULL;
+ }
+ newsize=size+oy->fill+4096; /* an extra page to be nice */
if(oy->data)
ret=_ogg_realloc(oy->data,newsize);
else
@@ -723,16 +687,15 @@ long ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og){
/* replace the computed checksum with the one actually read in */
memcpy(page+22,chksum,4);
+#ifndef DISABLE_CRC
/* Bad checksum. Lose sync */
goto sync_fail;
+#endif
}
}
/* yes, have a whole page all ready to go */
{
- unsigned char *page=oy->data+oy->returned;
- long bytes;
-
if(og){
og->header=page;
og->header_len=oy->headerbytes;
@@ -875,6 +838,7 @@ int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og){
some segments */
if(continued){
if(os->lacing_fill<1 ||
+ (os->lacing_vals[os->lacing_fill-1]&0xff)<255 ||
os->lacing_vals[os->lacing_fill-1]==0x400){
bos=0;
for(;segptr<segments;segptr++){
@@ -1492,6 +1456,34 @@ const int head3_7[] = {0x4f,0x67,0x67,0x53,0,0x05,
1,
0};
+int compare_packet(const ogg_packet *op1, const ogg_packet *op2){
+ if(op1->packet!=op2->packet){
+ fprintf(stderr,"op1->packet != op2->packet\n");
+ return(1);
+ }
+ if(op1->bytes!=op2->bytes){
+ fprintf(stderr,"op1->bytes != op2->bytes\n");
+ return(1);
+ }
+ if(op1->b_o_s!=op2->b_o_s){
+ fprintf(stderr,"op1->b_o_s != op2->b_o_s\n");
+ return(1);
+ }
+ if(op1->e_o_s!=op2->e_o_s){
+ fprintf(stderr,"op1->e_o_s != op2->e_o_s\n");
+ return(1);
+ }
+ if(op1->granulepos!=op2->granulepos){
+ fprintf(stderr,"op1->granulepos != op2->granulepos\n");
+ return(1);
+ }
+ if(op1->packetno!=op2->packetno){
+ fprintf(stderr,"op1->packetno != op2->packetno\n");
+ return(1);
+ }
+ return(0);
+}
+
void test_pack(const int *pl, const int **headers, int byteskip,
int pageskip, int packetskip){
unsigned char *data=_ogg_malloc(1024*1024); /* for scripted test cases only */
@@ -1577,7 +1569,7 @@ void test_pack(const int *pl, const int **headers, int byteskip,
byteskipcount=byteskip;
}
- ogg_sync_wrote(&oy,next-buf);
+ ogg_sync_wrote(&oy,(long)(next-buf));
while(1){
int ret=ogg_sync_pageout(&oy,&og_de);
@@ -1600,7 +1592,7 @@ void test_pack(const int *pl, const int **headers, int byteskip,
ogg_stream_packetout(&os_de,&op_de); /* just catching them all */
/* verify peek and out match */
- if(memcmp(&op_de,&op_de2,sizeof(op_de))){
+ if(compare_packet(&op_de,&op_de2)){
fprintf(stderr,"packetout != packetpeek! pos=%ld\n",
depacket);
exit(1);
@@ -1785,6 +1777,7 @@ int main(void){
test_pack(packets,headret,0,0,0);
}
+#ifndef DISABLE_CRC
{
/* test for the libogg 1.1.1 resync in large continuation bug
found by Josh Coalson) */
@@ -1794,6 +1787,9 @@ int main(void){
fprintf(stderr,"testing continuation resync in very large packets... ");
test_pack(packets,headret,100,2,3);
}
+#else
+ fprintf(stderr,"Skipping continuation resync test due to --disable-crc\n");
+#endif
{
/* term only page. why not? */
@@ -2055,6 +2051,7 @@ int main(void){
fprintf(stderr,"ok.\n");
}
+#ifndef DISABLE_CRC
/* Test recapture: page + garbage + page */
{
ogg_page og_de;
@@ -2096,6 +2093,9 @@ int main(void){
fprintf(stderr,"ok.\n");
}
+#else
+ fprintf(stderr,"Skipping recapture test due to --disable-crc\n");
+#endif
/* Free page data that was previously copied */
{
@@ -2104,6 +2104,9 @@ int main(void){
}
}
}
+ ogg_sync_clear(&oy);
+ ogg_stream_clear(&os_en);
+ ogg_stream_clear(&os_de);
return(0);
}
diff --git a/media/libogg/update.sh b/media/libogg/update.sh
index 5a7184bcbd..5e12863a03 100644..100755
--- a/media/libogg/update.sh
+++ b/media/libogg/update.sh
@@ -2,12 +2,16 @@
#
# Copies the needed files from a directory containing the original
# libogg source that we need for the Mozilla HTML5 media support.
+#
+# Before executing this script, make sure you've already run ./configure
+# on the libogg source to ensure config_types.h exists.
+cp $1/src/crctable.h ./include/crctable.h
cp $1/include/ogg/config_types.h ./include/ogg/config_types.h
cp $1/include/ogg/ogg.h ./include/ogg/ogg.h
cp $1/include/ogg/os_types.h ./include/ogg/os_types.h
cp $1/CHANGES ./CHANGES
cp $1/COPYING ./COPYING
-cp $1/README ./README
+cp $1/README.md ./README.md
cp $1/src/bitwise.c ./src/ogg_bitwise.c
cp $1/src/framing.c ./src/ogg_framing.c
cp $1/AUTHORS ./AUTHORS
diff --git a/media/libvorbis/CHANGES b/media/libvorbis/CHANGES
new file mode 100644
index 0000000000..ba0c3ca01a
--- /dev/null
+++ b/media/libvorbis/CHANGES
@@ -0,0 +1,185 @@
+libvorbis 1.3.7 (2020-07-04) -- "Xiph.Org libVorbis I 20200704 (Reducing Environment)"
+
+* Fix CVE-2018-10393 - out-of-bounds read encoding very low sample rates.
+* Fix CVE-2017-14160 - out-of-bounds read encoding very low sample rates.
+* Fix CVE-2018-10392 - out-of-bounds access encoding invalid channel count.
+* Fix handling invalid bytes per sample arguments.
+* Fix handling invalid channel count arguments.
+* Fix invalid free on seek failure.
+* Fix negative shift reading blocksize.
+* Fix accepting unreasonable float32 values.
+* Fix tag comparison depending on locale.
+* Fix unnecessarily linking libm.
+* Fix memory leak in test_sharedbook.
+* Update Visual Studio projects for ogg library filename change.
+* Distribute CMake build files with the source package.
+* Remove unnecessary configure --target switch.
+* Add gitlab CI support.
+* Add OSS-Fuzz support.
+* Build system and integration updates.
+
+libvorbis 1.3.6 (2018-03-16) -- "Xiph.Org libVorbis I 20180316 (Now 100% fewer shells)"
+
+* Fix CVE-2018-5146 - out-of-bounds write on codebook decoding.
+* Fix CVE-2017-14632 - free() on uninitialized data.
+* Fix CVE-2017-14633 - out-of-bounds read.
+* Fix bitrate metadata parsing.
+* Fix out-of-bounds read in codebook parsing.
+* Fix residue vector size in Vorbis I spec.
+* Appveyor support
+* Travis CI support
+* Add secondary CMake build system.
+* Build system fixes
+
+libvorbis 1.3.5 (2015-03-03) -- "Xiph.Org libVorbis I 20150105 (⛄⛄⛄⛄)"
+
+* Tolerate single-entry codebooks.
+* Fix decoder crash with invalid input.
+* Fix encoder crash with non-positive sample rates.
+* Fix issues in vorbisfile's seek bisection code.
+* Spec errata.
+* Reject multiple headers of the same type.
+* Various build fixes and code cleanup.
+
+libvorbis 1.3.4 (2014-01-22) -- "Xiph.Org libVorbis I 20140122 (Turpakäräjiin)"
+
+* Reduce codebook footprint in library code.
+* Various build and documentation fixes.
+
+libvorbis 1.3.3 (2012-02-03) -- "Xiph.Org libVorbis I 20120203 (Omnipresent)"
+
+* vorbis: additional proofing against invalid/malicious
+ streams in decode (see SVN for details).
+* vorbis: fix a memory leak in vorbis_commentheader_out().
+* updates, corrections and clarifications in the Vorbis I specification
+ document
+* win32: fixed project configuration which referenced two CRT versions
+ in output binaries.
+* build warning fixes
+
+libvorbis 1.3.2 (2010-11-01) -- "Xiph.Org libVorbis I 20101101 (Schaufenugget)"
+
+ * vorbis: additional proofing against invalid/malicious
+ streams in floor, residue, and bos/eos packet trimming
+ code (see SVN for details).
+ * vorbis: Added programming documentation tree for the
+ low-level calls
+ * vorbisfile: Correct handling of serial numbers array
+ element [0] on non-seekable streams
+ * vorbisenc: Back out an [old] AoTuV HF weighting that was
+ first enabled in 1.3.0; there are a few samples where I
+ really don't like the effect it causes.
+ * vorbis: return correct timestamp for granule positions
+ with high bit set.
+ * vorbisfile: the [undocumented] half-rate decode api made no
+ attempt to keep the pcm offset tracking consistent in seeks.
+ Fix and add a testing mode to seeking_example.c to torture
+ test seeking in halfrate mode. Also remove requirement that
+ halfrate mode only work with seekable files.
+ * vorbisfile: Fix a chaining bug in raw_seeks where seeking
+ out of the current link would fail due to not
+ reinitializing the decode machinery.
+ * vorbisfile: improve seeking strategy. Reduces the
+ necessary number of seek callbacks in an open or seek
+ operation by well over 2/3.
+
+libvorbis 1.3.1 (2010-02-26) -- "Xiph.Org libVorbis I 20100325 (Everywhere)"
+
+ * tweak + minor arithmetic fix in floor1 fit
+ * revert noise norm to conservative 1.2.3 behavior pending
+ more listening testing
+
+libvorbis 1.3.0 (2010-02-25) -- unreleased staging snapshot
+
+ * Optimized surround support for 5.1 encoding at 44.1/48kHz
+ * Added encoder control call to disable channel coupling
+ * Correct an overflow bug in very low-bitrate encoding on 32 bit
+ machines that caused inflated bitrates
+ * Numerous API hardening, leak and build fixes
+ * Correct bug in 22kHz compand setup that could cause a crash
+ * Correct bug in 16kHz codebooks that could cause unstable pure
+ tones at high bitrates
+
+libvorbis 1.2.3 (2009-07-09) -- "Xiph.Org libVorbis I 20090709"
+
+ * correct a vorbisfile bug that prevented proper playback of
+ Vorbis files where all audio in a logical stream is in a
+ single page
+ * Additional decode setup hardening against malicious streams
+ * Add 'OV_EXCLUDE_STATIC_CALLBACKS' define for developers who
+ wish to avoid unused symbol warnings from the static callbacks
+ defined in vorbisfile.h
+
+libvorbis 1.2.2 (2009-06-24) -- "Xiph.Org libVorbis I 20090624"
+
+ * define VENDOR and ENCODER strings
+ * seek correctly in files bigger than 2 GB (Windows)
+ * fix regression from CVE-2008-1420; 1.0b1 files work again
+ * mark all tables as constant to reduce memory occupation
+ * additional decoder hardening against malicious streams
+ * substantially reduce amount of seeking performed by Vorbisfile
+ * Multichannel decode bugfix
+ * build system updates
+ * minor specification clarifications/fixes
+
+libvorbis 1.2.1 (unreleased) -- "Xiph.Org libVorbis I 20080501"
+
+ * Improved robustness with corrupt streams.
+ * New ov_read_filter() vorbisfile call allows filtering decoded
+ audio as floats before converting to integer samples.
+ * Fix an encoder bug with multichannel streams.
+ * Replaced RTP payload format draft with RFC 5215.
+ * Bare bones self test under 'make check'.
+ * Fix a problem encoding some streams between 14 and 28 kHz.
+ * Fix a numerical instability in the edge extrapolation filter.
+ * Build system improvements.
+ * Specification correction.
+
+libvorbis 1.2.0 (2007-07-25) -- "Xiph.Org libVorbis I 20070622"
+
+ * new ov_fopen() convenience call that avoids the common
+ stdio conflicts with ov_open() and MSVC runtimes.
+ * libvorbisfile now handles multiplexed streams
+ * improve robustness to corrupt input streams
+ * fix a minor encoder bug
+ * updated RTP draft
+ * build system updates
+ * minor corrections to the specification
+
+libvorbis 1.1.2 (2005-11-27) -- "Xiph.Org libVorbis I 20050304"
+
+ * fix a serious encoder bug with gcc 4 optimized builds
+ * documentation and spec fixes
+ * updated VS2003 and XCode builds
+ * new draft RTP encapsulation spec
+
+libvorbis 1.1.1 (2005-06-27) -- "Xiph.Org libVorbis I 20050304"
+
+ * bug fix to the bitrate management encoder interface
+ * bug fix to properly set packetno field in the encoder
+ * new draft RTP encapsulation spec
+ * library API documentation improvements
+
+libvorbis 1.1.0 (2004-09-22) -- "Xiph.Org libVorbis I 20040629"
+
+ * merges tuning improvements from Aoyumi's aoTuV with fixups
+ * new managed bitrate (CBR) mode support
+ * new vorbis_encoder_ctl() interface
+ * extensive documentation updates
+ * application/ogg mimetype is now official
+ * autotools cleanup from Thomas Vander Stichele
+ * SymbianOS build support from Colin Ward at CSIRO
+ * various bugfixes
+ * various packaging improvements
+
+libvorbis 1.0.1 (2003-11-17) -- "Xiph.Org libVorbis I 20030909"
+
+ * numerous bug fixes
+ * specification corrections
+ * new crosslap and halfrate APIs for game use
+ * packaging and build updates
+
+libvorbis 1.0.0 (2002-07-19) -- "Xiph.Org libVorbis I 20020717"
+
+ * first stable release
+
diff --git a/media/libvorbis/COPYING b/media/libvorbis/COPYING
index 8f1d18cc2b..fb456a87bd 100644
--- a/media/libvorbis/COPYING
+++ b/media/libvorbis/COPYING
@@ -1,4 +1,4 @@
-Copyright (c) 2002-2015 Xiph.org Foundation
+Copyright (c) 2002-2020 Xiph.org Foundation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/media/libvorbis/README b/media/libvorbis/README
deleted file mode 100644
index 343be9a452..0000000000
--- a/media/libvorbis/README
+++ /dev/null
@@ -1,134 +0,0 @@
-********************************************************************
-* *
-* THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
-* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
-* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
-* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
-* *
-* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
-* by the Xiph.org Foundation, http://www.xiph.org/ *
-* *
-********************************************************************
-
-Vorbis is a general purpose audio and music encoding format
-contemporary to MPEG-4's AAC and TwinVQ, the next generation beyond
-MPEG audio layer 3. Unlike the MPEG sponsored formats (and other
-proprietary formats such as RealAudio G2 and Windows' flavor of the
-month), the Vorbis CODEC specification belongs to the public domain.
-All the technical details are published and documented, and any
-software entity may make full use of the format without license
-fee, royalty or patent concerns.
-
-This package contains:
-
-* libvorbis, a BSD-style license software implementation of
- the Vorbis specification by the Xiph.Org Foundation
- (http://www.xiph.org/)
-
-* libvorbisfile, a BSD-style license convenience library
- built on Vorbis designed to simplify common uses
-
-* libvorbisenc, a BSD-style license library that provides a simple,
- programmatic encoding setup interface
-
-* example code making use of libogg, libvorbis, libvorbisfile and
- libvorbisenc
-
-WHAT'S HERE:
-
-This source distribution includes libvorbis and an example
-encoder/player to demonstrate use of libvorbis as well as
-documentation on the Ogg Vorbis audio coding format.
-
-You'll need libogg (distributed separately) to compile this library.
-A more comprehensive set of utilities is available in the vorbis-tools
-package.
-
-Directory:
-
-./lib The source for the libraries, a BSD-license implementation
- of the public domain Ogg Vorbis audio encoding format.
-
-./include Library API headers
-
-./debian Rules/spec files for building Debian .deb packages
-
-./doc Vorbis documentation
-
-./examples Example code illustrating programmatic use of libvorbis,
- libvorbisfile and libvorbisenc
-
-./mac Codewarrior project files and build tweaks for MacOS.
-
-./macosx Project files for MacOS X.
-
-./win32 Win32 projects files and build automation
-
-./vq Internal utilities for training/building new LSP/residue
- and auxiliary codebooks.
-
-CONTACT:
-
-The Ogg homepage is located at 'http://www.xiph.org/ogg/'.
-Vorbis's homepage is located at 'http://www.xiph.org/vorbis/'.
-Up to date technical documents, contact information, source code and
-pre-built utilities may be found there.
-
-The user website for Ogg Vorbis software and audio is http://vorbis.com/
-
-BUILDING FROM TRUNK:
-
-Development source is under subversion revision control at
-https://svn.xiph.org/trunk/vorbis/. You will also need the
-newest versions of autoconf, automake, libtool and pkg-config in
-order to compile Vorbis from development source. A configure script
-is provided for you in the source tarball distributions.
-
- [update or checkout latest source]
- ./autogen.sh
- make
-
-and as root if desired:
-
- make install
-
-This will install the Vorbis libraries (static and shared) into
-/usr/local/lib, includes into /usr/local/include and API manpages
-(once we write some) into /usr/local/man.
-
-Documentation building requires xsltproc and pdfxmltex.
-
-BUILDING FROM TARBALL DISTRIBUTIONS:
-
- ./configure
- make
-
-and optionally (as root):
- make install
-
-BUILDING RPMS:
-
-after normal configuring:
-
- make dist
- rpm -ta libvorbis-<version>.tar.gz
-
-BUILDING ON MACOS 9:
-
-Vorbis on MacOS 9 is built using Metroworks CodeWarrior. To build it,
-first verify that the Ogg libraries are already built following the
-instructions in the Ogg module README. Open vorbis/mac/libvorbis.mcp,
-switch to the "Targets" pane, select everything, and make the project.
-Do the same thing to build libvorbisenc.mcp, and libvorbisfile.mcp (in
-that order). In vorbis/mac/Output you will now have both debug and final
-versions of Vorbis shared libraries to link your projects against.
-
-To build a project using Ogg Vorbis, add access paths to your
-CodeWarrior project for the ogg/include, ogg/mac/Output,
-vorbis/include, and vorbis/mac/Output folders. Be sure that
-"interpret DOS and Unix paths" is turned on in your project; it can
-be found in the "access paths" pane in your project settings. Now
-simply add the shared libraries you need to your project (OggLib and
-VorbisLib at least) and #include "ogg/ogg.h" and "vorbis/codec.h"
-wherever you need to access Ogg and Vorbis functionality.
-
diff --git a/media/libvorbis/README.md b/media/libvorbis/README.md
new file mode 100644
index 0000000000..30c88d391d
--- /dev/null
+++ b/media/libvorbis/README.md
@@ -0,0 +1,147 @@
+# Vorbis
+
+[![GitLab Build Status](https://gitlab.xiph.org/xiph/vorbis/badges/master/pipeline.svg)](https://gitlab.xiph.org/xiph/vorbis/-/pipelines)
+[![Travis Build Status](https://travis-ci.org/xiph/vorbis.svg?branch=master)](https://travis-ci.org/xiph/vorbis)
+[![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/github/xiph/vorbis?branch=master&svg=true)](https://ci.appveyor.com/project/rillian/vorbis)
+
+Vorbis is a general purpose audio and music encoding format
+contemporary to MPEG-4's AAC and TwinVQ, the next generation beyond
+MPEG audio layer 3. Unlike the MPEG sponsored formats (and other
+proprietary formats such as RealAudio G2 and Windows' flavor of the
+month), the Vorbis CODEC specification belongs to the public domain.
+All the technical details are published and documented, and any
+software entity may make full use of the format without license
+fee, royalty or patent concerns.
+
+This package contains:
+
+- libvorbis, a BSD-style license software implementation of
+ the Vorbis specification by the Xiph.Org Foundation
+ (https://xiph.org/)
+
+- libvorbisfile, a BSD-style license convenience library
+ built on Vorbis designed to simplify common uses
+
+- libvorbisenc, a BSD-style license library that provides a simple,
+ programmatic encoding setup interface
+
+- example code making use of libogg, libvorbis, libvorbisfile and
+ libvorbisenc
+
+## What's here ##
+
+This source distribution includes libvorbis and an example
+encoder/player to demonstrate use of libvorbis as well as
+documentation on the Ogg Vorbis audio coding format.
+
+You'll need libogg (distributed separately) to compile this library.
+A more comprehensive set of utilities is available in the vorbis-tools
+package.
+
+Directory:
+
+- `lib` The source for the libraries, a BSD-license implementation of the public domain Ogg Vorbis audio encoding format.
+
+- `include` Library API headers
+
+- `debian` Rules/spec files for building Debian .deb packages
+
+- `doc` Vorbis documentation
+
+- `examples` Example code illustrating programmatic use of libvorbis, libvorbisfile and libvorbisenc
+
+- `macosx` Project files for MacOS X.
+
+- `win32` Win32 projects files and build automation
+
+- `vq` Internal utilities for training/building new LSP/residue and auxiliary codebooks.
+
+## Contact ##
+
+The Ogg homepage is located at 'https://xiph.org/ogg/'.
+Vorbis's homepage is located at 'https://xiph.org/vorbis/'.
+Up to date technical documents, contact information, source code and
+pre-built utilities may be found there.
+
+## Building ##
+
+#### Building from master ####
+
+Development source is under git revision control at
+https://gitlab.xiph.org/xiph/vorbis.git. You will also need the
+newest versions of autoconf, automake, libtool and pkg-config in
+order to compile Vorbis from development source. A configure script
+is provided for you in the source tarball distributions.
+
+ ./autogen.sh
+ ./configure
+ make
+
+and as root if desired:
+
+ make install
+
+This will install the Vorbis libraries (static and shared) into
+/usr/local/lib, includes into /usr/local/include and API manpages
+(once we write some) into /usr/local/man.
+
+Documentation building requires xsltproc and pdfxmltex.
+
+#### Building from tarball distributions ####
+
+ ./configure
+ make
+
+and optionally (as root):
+
+ make install
+
+#### Building RPM packages ####
+
+after normal configuring:
+
+ make dist
+ rpm -ta libvorbis-<version>.tar.gz
+
+## Building with CMake ##
+
+Vorbis supports building using [CMake](https://cmake.org/). CMake is a meta build system that generates native projects for each platform.
+To generate projects, just run cmake, replacing `YOUR-PROJECT-GENERATOR` with a proper generator from the list [here](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html):
+
+ cmake -G YOUR-PROJECT-GENERATOR .
+
+Note that by default cmake generates projects that will build static libraries.
+To generate projects that will build a dynamic library instead, use the `BUILD_SHARED_LIBS` option like this:
+
+ cmake -G YOUR-PROJECT-GENERATOR -DBUILD_SHARED_LIBS=1 .
+
+After the projects are generated, use them as usual.
+
+#### Building on Windows ####
+
+Use the proper generator for your Visual Studio version, for example:
+
+ cmake -G "Visual Studio 12 2013" .
+
+#### Building on Mac OS X ####
+
+Use the Xcode generator. To build a framework, run:
+
+ cmake -G Xcode -DBUILD_FRAMEWORK=1 .
+
+#### Building on Linux ####
+
+Use the Makefile generator, which is the default:
+
+ cmake .
+ make
+
+## License ##
+
+THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.
+USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS
+GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE
+IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
+
+THE OggVorbis SOURCE CODE IS COPYRIGHT (C) 1994-2020
+by the Xiph.Org Foundation https://xiph.org/
diff --git a/media/libvorbis/README_MOZILLA b/media/libvorbis/README_MOZILLA
index 1211ac074b..34c86dad62 100644
--- a/media/libvorbis/README_MOZILLA
+++ b/media/libvorbis/README_MOZILLA
@@ -1,10 +1,10 @@
-The source from this directory was copied from the libvorbis
-subversion repository using the update.sh script. The only changes
+The source from this directory was copied from the vorbis
+git repository using the update.sh script. The only changes
made were those applied by update.sh and the addition/update of
Makefile.in and moz.build files for the Mozilla build system.
-The upstream version used was libvorbis 1.3.5.
-https://svn.xiph.org/tags/vorbis/libvorbis-1.3.5@19464
+The upstream version used was vorbis v1.3.7 (84c02369)
+from https://gitlab.xiph.org/xiph/vorbis
Some files are renamed during the copy to prevent clashes with object
file names with other Mozilla libraries.
diff --git a/media/libvorbis/include/vorbis/codec.h b/media/libvorbis/include/vorbis/codec.h
index 999aa33510..f8a912bc26 100644
--- a/media/libvorbis/include/vorbis/codec.h
+++ b/media/libvorbis/include/vorbis/codec.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
********************************************************************
function: libvorbis codec headers
- last mod: $Id: codec.h 17021 2010-03-24 09:29:41Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/include/vorbis/vorbisenc.h b/media/libvorbis/include/vorbis/vorbisenc.h
index 02332b50ca..085b15e669 100644
--- a/media/libvorbis/include/vorbis/vorbisenc.h
+++ b/media/libvorbis/include/vorbis/vorbisenc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: vorbis encode-engine setup
- last mod: $Id: vorbisenc.h 17021 2010-03-24 09:29:41Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/backends.h b/media/libvorbis/lib/backends.h
index ff5bcc95fe..670b0b902e 100644
--- a/media/libvorbis/lib/backends.h
+++ b/media/libvorbis/lib/backends.h
@@ -6,13 +6,12 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: libvorbis backend and mapping structures; needed for
static mode headers
- last mod: $Id: backends.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/bitrate.h b/media/libvorbis/lib/bitrate.h
index db48fcb645..48fa150596 100644
--- a/media/libvorbis/lib/bitrate.h
+++ b/media/libvorbis/lib/bitrate.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: bitrate tracking and management
- last mod: $Id: bitrate.h 13293 2007-07-24 00:09:47Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/coupled/res_books_51.h b/media/libvorbis/lib/books/coupled/res_books_51.h
index 93910ff481..eb569c6f04 100644
--- a/media/libvorbis/lib/books/coupled/res_books_51.h
+++ b/media/libvorbis/lib/books/coupled/res_books_51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
*
* function: static codebooks for 5.1 surround
- * last modified: $Id: res_books_51.h 19057 2014-01-22 12:32:31Z xiphmont $
*
********************************************************************/
diff --git a/media/libvorbis/lib/books/coupled/res_books_stereo.h b/media/libvorbis/lib/books/coupled/res_books_stereo.h
index 9a9049f6ed..7b53cb972b 100644
--- a/media/libvorbis/lib/books/coupled/res_books_stereo.h
+++ b/media/libvorbis/lib/books/coupled/res_books_stereo.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: res_books_stereo.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/floor/floor_books.h b/media/libvorbis/lib/books/floor/floor_books.h
index e925313f7b..d26664f766 100644
--- a/media/libvorbis/lib/books/floor/floor_books.h
+++ b/media/libvorbis/lib/books/floor/floor_books.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: floor_books.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h b/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
index 736353b675..107e22f9e3 100644
--- a/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
+++ b/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: res_books_uncoupled.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/codebook.h b/media/libvorbis/lib/codebook.h
index 537d6c12d3..7d4e2aae4f 100644
--- a/media/libvorbis/lib/codebook.h
+++ b/media/libvorbis/lib/codebook.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic shared codebook operations
- last mod: $Id: codebook.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/codec_internal.h b/media/libvorbis/lib/codec_internal.h
index de1bccaedf..2ecf5e5c73 100644
--- a/media/libvorbis/lib/codec_internal.h
+++ b/media/libvorbis/lib/codec_internal.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: libvorbis codec headers
- last mod: $Id: codec_internal.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/envelope.h b/media/libvorbis/lib/envelope.h
index fd15fb32a7..2ef60a82ca 100644
--- a/media/libvorbis/lib/envelope.h
+++ b/media/libvorbis/lib/envelope.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data envelope analysis and manipulation
- last mod: $Id: envelope.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/highlevel.h b/media/libvorbis/lib/highlevel.h
index e38f370fd6..7690e3ebfb 100644
--- a/media/libvorbis/lib/highlevel.h
+++ b/media/libvorbis/lib/highlevel.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: highlevel encoder setup struct separated out for vorbisenc clarity
- last mod: $Id: highlevel.h 17195 2010-05-05 21:49:51Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/lookup.h b/media/libvorbis/lib/lookup.h
index f8b5b82730..ec05014f44 100644
--- a/media/libvorbis/lib/lookup.h
+++ b/media/libvorbis/lib/lookup.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup based functions
- last mod: $Id: lookup.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lookup_data.h b/media/libvorbis/lib/lookup_data.h
index 2424a1b386..7935715a70 100644
--- a/media/libvorbis/lib/lookup_data.h
+++ b/media/libvorbis/lib/lookup_data.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup data; generated by lookups.pl; edit there
- last mod: $Id: lookup_data.h 16037 2009-05-26 21:10:58Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lpc.h b/media/libvorbis/lib/lpc.h
index 39d237601b..4f59e6d32d 100644
--- a/media/libvorbis/lib/lpc.h
+++ b/media/libvorbis/lib/lpc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LPC low level routines
- last mod: $Id: lpc.h 16037 2009-05-26 21:10:58Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lsp.h b/media/libvorbis/lib/lsp.h
index bacfb0971f..68b38daf16 100644
--- a/media/libvorbis/lib/lsp.h
+++ b/media/libvorbis/lib/lsp.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LSP (also called LSF) conversion routines
- last mod: $Id: lsp.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/masking.h b/media/libvorbis/lib/masking.h
index 3576ab7885..7a196a37eb 100644
--- a/media/libvorbis/lib/masking.h
+++ b/media/libvorbis/lib/masking.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: masking curve data for psychoacoustics
- last mod: $Id: masking.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/mdct.h b/media/libvorbis/lib/mdct.h
index 3ed94333c5..ceaea617a3 100644
--- a/media/libvorbis/lib/mdct.h
+++ b/media/libvorbis/lib/mdct.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: modified discrete cosine transform prototypes
- last mod: $Id: mdct.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/misc.h b/media/libvorbis/lib/misc.h
index 73b4519898..eac5160e88 100644
--- a/media/libvorbis/lib/misc.h
+++ b/media/libvorbis/lib/misc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: miscellaneous prototypes
- last mod: $Id: misc.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/floor_all.h b/media/libvorbis/lib/modes/floor_all.h
index 4292be326e..2e3d4a5012 100644
--- a/media/libvorbis/lib/modes/floor_all.h
+++ b/media/libvorbis/lib/modes/floor_all.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: key floor settings
- last mod: $Id: floor_all.h 17050 2010-03-26 01:34:42Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_11.h b/media/libvorbis/lib/modes/psych_11.h
index 844a8ed3cd..9d8ed357ee 100644
--- a/media/libvorbis/lib/modes/psych_11.h
+++ b/media/libvorbis/lib/modes/psych_11.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 11kHz settings
- last mod: $Id: psych_11.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_16.h b/media/libvorbis/lib/modes/psych_16.h
index 1c10b3954e..49cbf7c4b2 100644
--- a/media/libvorbis/lib/modes/psych_16.h
+++ b/media/libvorbis/lib/modes/psych_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 16kHz settings
- last mod: $Id: psych_16.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_44.h b/media/libvorbis/lib/modes/psych_44.h
index f05c032653..d15509b71d 100644
--- a/media/libvorbis/lib/modes/psych_44.h
+++ b/media/libvorbis/lib/modes/psych_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: key psychoacoustic settings for 44.1/48kHz
- last mod: $Id: psych_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_8.h b/media/libvorbis/lib/modes/psych_8.h
index 0e2dd57371..a19817f760 100644
--- a/media/libvorbis/lib/modes/psych_8.h
+++ b/media/libvorbis/lib/modes/psych_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 8kHz psychoacoustic settings
- last mod: $Id: psych_8.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_16.h b/media/libvorbis/lib/modes/residue_16.h
index dcaca5451e..15e161c862 100644
--- a/media/libvorbis/lib/modes/residue_16.h
+++ b/media/libvorbis/lib/modes/residue_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates 16/22kHz
- last mod: $Id: residue_16.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44.h b/media/libvorbis/lib/modes/residue_44.h
index 236c18341b..3f982695a7 100644
--- a/media/libvorbis/lib/modes/residue_44.h
+++ b/media/libvorbis/lib/modes/residue_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz
- last mod: $Id: residue_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44p51.h b/media/libvorbis/lib/modes/residue_44p51.h
index a52cc5245e..8ac5f65e62 100644
--- a/media/libvorbis/lib/modes/residue_44p51.h
+++ b/media/libvorbis/lib/modes/residue_44p51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz uncoupled
- last mod: $Id: residue_44p51.h 19013 2013-11-12 04:04:50Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44u.h b/media/libvorbis/lib/modes/residue_44u.h
index 92c4a09ce3..2f3595e49f 100644
--- a/media/libvorbis/lib/modes/residue_44u.h
+++ b/media/libvorbis/lib/modes/residue_44u.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz uncoupled
- last mod: $Id: residue_44u.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_8.h b/media/libvorbis/lib/modes/residue_8.h
index 94c6d84c44..b836f79c84 100644
--- a/media/libvorbis/lib/modes/residue_8.h
+++ b/media/libvorbis/lib/modes/residue_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates 8/11kHz
- last mod: $Id: residue_8.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_11.h b/media/libvorbis/lib/modes/setup_11.h
index 4c2d619ca2..5ade5dd169 100644
--- a/media/libvorbis/lib/modes/setup_11.h
+++ b/media/libvorbis/lib/modes/setup_11.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 11kHz settings
- last mod: $Id: setup_11.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_16.h b/media/libvorbis/lib/modes/setup_16.h
index 336007f98e..8b2daafa3f 100644
--- a/media/libvorbis/lib/modes/setup_16.h
+++ b/media/libvorbis/lib/modes/setup_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 16kHz settings
- last mod: $Id: setup_16.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_22.h b/media/libvorbis/lib/modes/setup_22.h
index 4fd5e57111..eef5a4e7da 100644
--- a/media/libvorbis/lib/modes/setup_22.h
+++ b/media/libvorbis/lib/modes/setup_22.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 22kHz settings
- last mod: $Id: setup_22.h 17026 2010-03-25 05:00:27Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_32.h b/media/libvorbis/lib/modes/setup_32.h
index 2275ac9615..f87cb767d0 100644
--- a/media/libvorbis/lib/modes/setup_32.h
+++ b/media/libvorbis/lib/modes/setup_32.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 32kHz
- last mod: $Id: setup_32.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44.h b/media/libvorbis/lib/modes/setup_44.h
index 3b88a89ac5..12d592808e 100644
--- a/media/libvorbis/lib/modes/setup_44.h
+++ b/media/libvorbis/lib/modes/setup_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz
- last mod: $Id: setup_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44p51.h b/media/libvorbis/lib/modes/setup_44p51.h
index 67d9979608..4d49173ffb 100644
--- a/media/libvorbis/lib/modes/setup_44p51.h
+++ b/media/libvorbis/lib/modes/setup_44p51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz 5.1 surround modes
- last mod: $Id: setup_44p51.h 19013 2013-11-12 04:04:50Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44u.h b/media/libvorbis/lib/modes/setup_44u.h
index 568b5f8959..2dd8bf701f 100644
--- a/media/libvorbis/lib/modes/setup_44u.h
+++ b/media/libvorbis/lib/modes/setup_44u.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz uncoupled modes
- last mod: $Id: setup_44u.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_8.h b/media/libvorbis/lib/modes/setup_8.h
index 14c48374fa..16b02e01b7 100644
--- a/media/libvorbis/lib/modes/setup_8.h
+++ b/media/libvorbis/lib/modes/setup_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 8kHz settings
- last mod: $Id: setup_8.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_X.h b/media/libvorbis/lib/modes/setup_X.h
index a69f5d40a2..27807c10b4 100644
--- a/media/libvorbis/lib/modes/setup_X.h
+++ b/media/libvorbis/lib/modes/setup_X.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: catch-all toplevel settings for q modes only
- last mod: $Id: setup_X.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/os.h b/media/libvorbis/lib/os.h
index 8bc3e5fe9c..9ded7358d4 100644
--- a/media/libvorbis/lib/os.h
+++ b/media/libvorbis/lib/os.h
@@ -8,12 +8,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: #ifdef jail to whip a few platforms into the UNIX ideal.
- last mod: $Id: os.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
@@ -31,7 +30,7 @@
# ifdef __GNUC__
# define STIN static __inline__
-# elif _WIN32
+# elif defined(_WIN32)
# define STIN static __inline
# else
# define STIN static
@@ -61,7 +60,7 @@ void *_alloca(size_t size);
# define FAST_HYPOT hypot
#endif
-#endif
+#endif /* _V_IFDEFJAIL_H_ */
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
@@ -81,7 +80,7 @@ void *_alloca(size_t size);
/* Special i386 GCC implementation */
-#if defined(__i386__) && defined(__GNUC__) && !defined(__BEOS__)
+#if defined(__i386__) && defined(__GNUC__) && !defined(__BEOS__) && !defined(__SSE2_MATH__)
# define VORBIS_FPU_CONTROL
/* both GCC and MSVC are kinda stupid about rounding/casting to int.
Because of encapsulation constraints (GCC can't see inside the asm
@@ -120,8 +119,7 @@ static inline int vorbis_ftoi(double f){ /* yes, double! Otherwise,
/* MSVC inline assembly. 32 bit only; inline ASM isn't implemented in the
* 64 bit compiler and doesn't work on arm. */
-#if defined(_MSC_VER) && !defined(_WIN64) && \
- !defined(_WIN32_WCE) && !defined(_M_ARM)
+#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_WIN32_WCE)
# define VORBIS_FPU_CONTROL
typedef ogg_int16_t vorbis_fpu_control;
@@ -148,7 +146,7 @@ static __inline void vorbis_fpu_restore(vorbis_fpu_control fpu){
/* Optimized code path for x86_64 builds. Uses SSE2 intrinsics. This can be
done safely because all x86_64 CPUs supports SSE2. */
-#if (defined(_MSC_VER) && defined(_WIN64)) || (defined(__GNUC__) && defined (__x86_64__))
+#if (defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined (__SSE2_MATH__))
# define VORBIS_FPU_CONTROL
typedef ogg_int16_t vorbis_fpu_control;
@@ -175,7 +173,7 @@ static __inline void vorbis_fpu_restore(vorbis_fpu_control fpu){
typedef int vorbis_fpu_control;
-static int vorbis_ftoi(double f){
+STIN int vorbis_ftoi(double f){
/* Note: MSVC and GCC (at least on some systems) round towards zero, thus,
the floor() call is required to ensure correct roudning of
negative numbers */
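A minimal sketch of the SSE2 conversion path now keyed off __SSE2_MATH__ / _M_X64 in the hunks above: _mm_cvtsd_si32() rounds according to MXCSR (round-to-nearest by default), so no x87 control-word save/restore is needed on this path. The helper name is illustrative, not libvorbis API.

    #include <emmintrin.h>

    static inline int ftoi_sse2(double f){
      return _mm_cvtsd_si32(_mm_load_sd(&f));
    }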
diff --git a/media/libvorbis/lib/psy.h b/media/libvorbis/lib/psy.h
index c1ea824401..d9a04e8b74 100644
--- a/media/libvorbis/lib/psy.h
+++ b/media/libvorbis/lib/psy.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: random psychoacoustics (not including preecho)
- last mod: $Id: psy.h 16946 2010-03-03 16:12:40Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/registry.h b/media/libvorbis/lib/registry.h
index 3ae04776d8..b823aa6091 100644
--- a/media/libvorbis/lib/registry.h
+++ b/media/libvorbis/lib/registry.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: registry for time, floor, res backends and channel mappings
- last mod: $Id: registry.h 15531 2008-11-24 23:50:06Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/scales.h b/media/libvorbis/lib/scales.h
index 613f796e77..3c2ae48d9e 100644
--- a/media/libvorbis/lib/scales.h
+++ b/media/libvorbis/lib/scales.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: linear scale -> dB, Bark and Mel scales
- last mod: $Id: scales.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/smallft.h b/media/libvorbis/lib/smallft.h
index 456497326c..02fe8f9cd4 100644
--- a/media/libvorbis/lib/smallft.h
+++ b/media/libvorbis/lib/smallft.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: fft transform
- last mod: $Id: smallft.h 13293 2007-07-24 00:09:47Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_analysis.c b/media/libvorbis/lib/vorbis_analysis.c
index 01aa6f30db..14919737eb 100644
--- a/media/libvorbis/lib/vorbis_analysis.c
+++ b/media/libvorbis/lib/vorbis_analysis.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: single-block PCM analysis mode dispatch
- last mod: $Id: analysis.c 16226 2009-07-08 06:43:49Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_bitrate.c b/media/libvorbis/lib/vorbis_bitrate.c
index 3a71b1dc23..132553cbee 100644
--- a/media/libvorbis/lib/vorbis_bitrate.c
+++ b/media/libvorbis/lib/vorbis_bitrate.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: bitrate tracking and management
- last mod: $Id: bitrate.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_block.c b/media/libvorbis/lib/vorbis_block.c
index 345c042769..6a50da0843 100644
--- a/media/libvorbis/lib/vorbis_block.c
+++ b/media/libvorbis/lib/vorbis_block.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data vector blocking, windowing and dis/reassembly
- last mod: $Id: block.c 19457 2015-03-03 00:15:29Z giles $
Handle windowing, overlap-add, etc of the PCM vectors. This is made
more amusing by Vorbis' current two allowed block sizes.
diff --git a/media/libvorbis/lib/vorbis_codebook.c b/media/libvorbis/lib/vorbis_codebook.c
index 49b7b09b27..7a0c206783 100644
--- a/media/libvorbis/lib/vorbis_codebook.c
+++ b/media/libvorbis/lib/vorbis_codebook.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic codebook pack/unpack/code/decode operations
- last mod: $Id: codebook.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_envelope.c b/media/libvorbis/lib/vorbis_envelope.c
index 010c66e2d6..22d39aa6e0 100644
--- a/media/libvorbis/lib/vorbis_envelope.c
+++ b/media/libvorbis/lib/vorbis_envelope.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data envelope analysis
- last mod: $Id: envelope.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_floor0.c b/media/libvorbis/lib/vorbis_floor0.c
index 213cce4ec8..f4a6d4d559 100644
--- a/media/libvorbis/lib/vorbis_floor0.c
+++ b/media/libvorbis/lib/vorbis_floor0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: floor backend 0 implementation
- last mod: $Id: floor0.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_floor1.c b/media/libvorbis/lib/vorbis_floor1.c
index d8bd4645c1..c4fe3ea7e7 100644
--- a/media/libvorbis/lib/vorbis_floor1.c
+++ b/media/libvorbis/lib/vorbis_floor1.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: floor backend 1 implementation
- last mod: $Id: floor1.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_info.c b/media/libvorbis/lib/vorbis_info.c
index 8a2a001f99..f2e39e387e 100644
--- a/media/libvorbis/lib/vorbis_info.c
+++ b/media/libvorbis/lib/vorbis_info.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: maintain the info structure, info <-> header packets
- last mod: $Id: info.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -20,7 +19,6 @@
#include <stdlib.h>
#include <string.h>
-#include <ctype.h>
#include <ogg/ogg.h>
#include "vorbis/codec.h"
#include "codec_internal.h"
@@ -31,8 +29,8 @@
#include "misc.h"
#include "os.h"
-#define GENERAL_VENDOR_STRING "Xiph.Org libVorbis 1.3.5"
-#define ENCODE_VENDOR_STRING "Xiph.Org libVorbis I 20150105 (⛄⛄⛄⛄)"
+#define GENERAL_VENDOR_STRING "Xiph.Org libVorbis 1.3.7"
+#define ENCODE_VENDOR_STRING "Xiph.Org libVorbis I 20200704 (Reducing Environment)"
/* helpers */
static void _v_writestring(oggpack_buffer *o,const char *s, int bytes){
@@ -48,6 +46,10 @@ static void _v_readstring(oggpack_buffer *o,char *buf,int bytes){
}
}
+static int _v_toupper(int c) {
+ return (c >= 'a' && c <= 'z') ? (c & ~('a' - 'A')) : c;
+}
+
void vorbis_comment_init(vorbis_comment *vc){
memset(vc,0,sizeof(*vc));
}
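
The new _v_toupper above replaces ctype.h's toupper, whose result depends on the current locale and is undefined for negative char values; Vorbis comment tags are case-insensitive over ASCII only, so a fixed ASCII mapping is the right tool. A minimal standalone sketch of the same mapping (the main and sample characters are illustrative only):

    #include <stdio.h>

    /* ASCII-only upcase: clear bit 5 for 'a'..'z', pass everything else through. */
    static int _v_toupper(int c) {
      return (c >= 'a' && c <= 'z') ? (c & ~('a' - 'A')) : c;
    }

    int main(void) {
      printf("%c %c %c\n", _v_toupper('a'), _v_toupper('Z'), _v_toupper('=')); /* A Z = */
      return 0;
    }
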
@@ -65,11 +67,13 @@ void vorbis_comment_add(vorbis_comment *vc,const char *comment){
}
void vorbis_comment_add_tag(vorbis_comment *vc, const char *tag, const char *contents){
- char *comment=alloca(strlen(tag)+strlen(contents)+2); /* +2 for = and \0 */
+ /* Length for key and value +2 for = and \0 */
+ char *comment=_ogg_malloc(strlen(tag)+strlen(contents)+2);
strcpy(comment, tag);
strcat(comment, "=");
strcat(comment, contents);
vorbis_comment_add(vc, comment);
+ _ogg_free(comment);
}
/* This is more or less the same as strncasecmp - but that doesn't exist
@@ -77,7 +81,7 @@ void vorbis_comment_add_tag(vorbis_comment *vc, const char *tag, const char *con
static int tagcompare(const char *s1, const char *s2, int n){
int c=0;
while(c < n){
- if(toupper(s1[c]) != toupper(s2[c]))
+ if(_v_toupper(s1[c]) != _v_toupper(s2[c]))
return !0;
c++;
}
@@ -88,27 +92,30 @@ char *vorbis_comment_query(vorbis_comment *vc, const char *tag, int count){
long i;
int found = 0;
int taglen = strlen(tag)+1; /* +1 for the = we append */
- char *fulltag = alloca(taglen+ 1);
+ char *fulltag = _ogg_malloc(taglen+1);
strcpy(fulltag, tag);
strcat(fulltag, "=");
for(i=0;i<vc->comments;i++){
if(!tagcompare(vc->user_comments[i], fulltag, taglen)){
- if(count == found)
+ if(count == found) {
/* We return a pointer to the data, not a copy */
- return vc->user_comments[i] + taglen;
- else
+ _ogg_free(fulltag);
+ return vc->user_comments[i] + taglen;
+ } else {
found++;
+ }
}
}
+ _ogg_free(fulltag);
return NULL; /* didn't find anything */
}
int vorbis_comment_query_count(vorbis_comment *vc, const char *tag){
int i,count=0;
int taglen = strlen(tag)+1; /* +1 for the = we append */
- char *fulltag = alloca(taglen+1);
+ char *fulltag = _ogg_malloc(taglen+1);
strcpy(fulltag,tag);
strcat(fulltag, "=");
@@ -117,6 +124,7 @@ int vorbis_comment_query_count(vorbis_comment *vc, const char *tag){
count++;
}
+ _ogg_free(fulltag);
return count;
}
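
The alloca calls removed in vorbis_comment_add_tag, vorbis_comment_query and vorbis_comment_query_count sized a stack buffer from caller-supplied string lengths; a long enough tag could overflow the stack with no way to detect failure. The heap variants fail cleanly and are freed on every exit path. A minimal sketch of the pattern, using plain malloc/free as stand-ins for the _ogg_malloc/_ogg_free wrappers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Build "TAG=" on the heap; returns NULL instead of overflowing the stack. */
    static char *make_fulltag(const char *tag) {
      char *fulltag = malloc(strlen(tag) + 2);   /* +2 for '=' and NUL */
      if (fulltag == NULL) return NULL;
      strcpy(fulltag, tag);
      strcat(fulltag, "=");
      return fulltag;
    }

    int main(void) {
      char *t = make_fulltag("ARTIST");
      if (t != NULL) {
        puts(t);   /* ARTIST= */
        free(t);   /* freed on every path, as in the patched functions */
      }
      return 0;
    }
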
@@ -198,6 +206,7 @@ void vorbis_info_clear(vorbis_info *vi){
static int _vorbis_unpack_info(vorbis_info *vi,oggpack_buffer *opb){
codec_setup_info *ci=vi->codec_setup;
+ int bs;
if(!ci)return(OV_EFAULT);
vi->version=oggpack_read(opb,32);
@@ -206,12 +215,16 @@ static int _vorbis_unpack_info(vorbis_info *vi,oggpack_buffer *opb){
vi->channels=oggpack_read(opb,8);
vi->rate=oggpack_read(opb,32);
- vi->bitrate_upper=oggpack_read(opb,32);
- vi->bitrate_nominal=oggpack_read(opb,32);
- vi->bitrate_lower=oggpack_read(opb,32);
+ vi->bitrate_upper=(ogg_int32_t)oggpack_read(opb,32);
+ vi->bitrate_nominal=(ogg_int32_t)oggpack_read(opb,32);
+ vi->bitrate_lower=(ogg_int32_t)oggpack_read(opb,32);
- ci->blocksizes[0]=1<<oggpack_read(opb,4);
- ci->blocksizes[1]=1<<oggpack_read(opb,4);
+ bs = oggpack_read(opb,4);
+ if(bs<0)goto err_out;
+ ci->blocksizes[0]=1<<bs;
+ bs = oggpack_read(opb,4);
+ if(bs<0)goto err_out;
+ ci->blocksizes[1]=1<<bs;
if(vi->rate<1)goto err_out;
if(vi->channels<1)goto err_out;
@@ -583,7 +596,8 @@ int vorbis_analysis_headerout(vorbis_dsp_state *v,
oggpack_buffer opb;
private_state *b=v->backend_state;
- if(!b||vi->channels<=0){
+ if(!b||vi->channels<=0||vi->channels>256){
+ b = NULL;
ret=OV_EFAULT;
goto err_out;
}
@@ -642,7 +656,7 @@ int vorbis_analysis_headerout(vorbis_dsp_state *v,
memset(op_code,0,sizeof(*op_code));
if(b){
- oggpack_writeclear(&opb);
+ if(vi->channels>0)oggpack_writeclear(&opb);
if(b->header)_ogg_free(b->header);
if(b->header1)_ogg_free(b->header1);
if(b->header2)_ogg_free(b->header2);
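
Two related hardening points in this file: the identity header stores the channel count in an 8-bit field, so counts the header cannot represent are now rejected before any header is built, and nulling b on that early-error path keeps the cleanup code below from clearing an opb that was never initialized. A tiny illustration of why the 8-bit field forces a bound (300 is an arbitrary out-of-range value):

    #include <stdio.h>

    int main(void) {
      int channels = 300;                              /* out of range */
      unsigned char field = (unsigned char)channels;   /* what an 8-bit header keeps */
      printf("%d -> %u\n", channels, field);           /* 300 -> 44: cannot round-trip */
      return 0;
    }
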
diff --git a/media/libvorbis/lib/vorbis_lookup.c b/media/libvorbis/lib/vorbis_lookup.c
index 3321ed3dbc..7cd01a44d3 100644
--- a/media/libvorbis/lib/vorbis_lookup.c
+++ b/media/libvorbis/lib/vorbis_lookup.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup based functions
- last mod: $Id: lookup.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_lpc.c b/media/libvorbis/lib/vorbis_lpc.c
index f5199ec235..877da47f8e 100644
--- a/media/libvorbis/lib/vorbis_lpc.c
+++ b/media/libvorbis/lib/vorbis_lpc.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LPC low level routines
- last mod: $Id: lpc.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_lsp.c b/media/libvorbis/lib/vorbis_lsp.c
index bf5ab861c9..a92882a131 100644
--- a/media/libvorbis/lib/vorbis_lsp.c
+++ b/media/libvorbis/lib/vorbis_lsp.c
@@ -6,19 +6,19 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LSP (also called LSF) conversion routines
- last mod: $Id: lsp.c 19453 2015-03-02 22:35:34Z xiphmont $
The LSP generation code is taken (with minimal modification and a
few bugfixes) from "On the Computation of the LSP Frequencies" by
Joseph Rothweiler (see http://www.rothweiler.us for contact info).
+
The paper is available at:
- http://www.myown1.com/joe/lsf
+ https://web.archive.org/web/20110810174000/http://home.myfairpoint.net/vzenxj75/myown1/joe/lsf/index.html
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_mapping0.c b/media/libvorbis/lib/vorbis_mapping0.c
index 85c7d22d83..efa0fbcd93 100644
--- a/media/libvorbis/lib/vorbis_mapping0.c
+++ b/media/libvorbis/lib/vorbis_mapping0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: channel mapping 0 implementation
- last mod: $Id: mapping0.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -93,7 +92,6 @@ static vorbis_info_mapping *mapping0_unpack(vorbis_info *vi,oggpack_buffer *opb)
int i,b;
vorbis_info_mapping0 *info=_ogg_calloc(1,sizeof(*info));
codec_setup_info *ci=vi->codec_setup;
- memset(info,0,sizeof(*info));
if(vi->channels<=0)goto err_out;
b=oggpack_read(opb,1);
diff --git a/media/libvorbis/lib/vorbis_mdct.c b/media/libvorbis/lib/vorbis_mdct.c
index 0816331805..2a0ff8d01b 100644
--- a/media/libvorbis/lib/vorbis_mdct.c
+++ b/media/libvorbis/lib/vorbis_mdct.c
@@ -6,13 +6,12 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: normalized modified discrete cosine transform
power of two length transform only [64 <= n ]
- last mod: $Id: mdct.c 16227 2009-07-08 06:58:46Z xiphmont $
Original algorithm adapted long ago from _The use of multirate filter
banks for coding of high quality digital audio_, by T. Sporer,
diff --git a/media/libvorbis/lib/vorbis_psy.c b/media/libvorbis/lib/vorbis_psy.c
index f7a44c6d00..036b094aa7 100644
--- a/media/libvorbis/lib/vorbis_psy.c
+++ b/media/libvorbis/lib/vorbis_psy.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: psychoacoustics not including preecho
- last mod: $Id: psy.c 18077 2011-09-02 02:49:00Z giles $
********************************************************************/
@@ -600,11 +599,12 @@ static void bark_noise_hybridmp(int n,const long *b,
XY[i] = tXY;
}
- for (i = 0, x = 0.f;; i++, x += 1.f) {
+ for (i = 0, x = 0.f; i < n; i++, x += 1.f) {
lo = b[i] >> 16;
- if( lo>=0 ) break;
hi = b[i] & 0xffff;
+ if( lo>=0 || -lo>=n ) break;
+ if( hi>=n ) break;
tN = N[hi] + N[-lo];
tX = X[hi] - X[-lo];
@@ -616,17 +616,17 @@ static void bark_noise_hybridmp(int n,const long *b,
B = tN * tXY - tX * tY;
D = tN * tXX - tX * tX;
R = (A + x * B) / D;
- if (R < 0.f)
- R = 0.f;
+ if (R < 0.f) R = 0.f;
noise[i] = R - offset;
}
- for ( ;; i++, x += 1.f) {
+ for ( ; i < n; i++, x += 1.f) {
lo = b[i] >> 16;
hi = b[i] & 0xffff;
- if(hi>=n)break;
+ if( lo<0 || lo>=n ) break;
+ if( hi>=n ) break;
tN = N[hi] - N[lo];
tX = X[hi] - X[lo];
@@ -642,6 +642,7 @@ static void bark_noise_hybridmp(int n,const long *b,
noise[i] = R - offset;
}
+
for ( ; i < n; i++, x += 1.f) {
R = (A + x * B) / D;
@@ -652,10 +653,11 @@ static void bark_noise_hybridmp(int n,const long *b,
if (fixed <= 0) return;
- for (i = 0, x = 0.f;; i++, x += 1.f) {
+ for (i = 0, x = 0.f; i < n; i++, x += 1.f) {
hi = i + fixed / 2;
lo = hi - fixed;
- if(lo>=0)break;
+ if ( hi>=n ) break;
+ if ( lo>=0 ) break;
tN = N[hi] + N[-lo];
tX = X[hi] - X[-lo];
@@ -671,11 +673,12 @@ static void bark_noise_hybridmp(int n,const long *b,
if (R - offset < noise[i]) noise[i] = R - offset;
}
- for ( ;; i++, x += 1.f) {
+ for ( ; i < n; i++, x += 1.f) {
hi = i + fixed / 2;
lo = hi - fixed;
- if(hi>=n)break;
+ if ( hi>=n ) break;
+ if ( lo<0 ) break;
tN = N[hi] - N[lo];
tX = X[hi] - X[lo];
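
All four scan loops in bark_noise_hybridmp previously ran with no upper bound on i and trusted the packed index table b[], where each entry carries two indices as (lo << 16) | hi; a malformed table could walk b[], N[] and X[] out of bounds. The rewritten loops bound i by n and validate both unpacked indices before use. A minimal sketch of the unpack-and-validate step, assuming the same packing (sample values are hypothetical):

    #include <stdio.h>

    /* Unpack one b[] entry and check both indices against n before use.
       A negative lo mirrors: the code indexes N[-lo], so -lo must be < n too. */
    static int indices_ok(long bi, int n, int *lo, int *hi) {
      *lo = (int)(bi >> 16);
      *hi = (int)(bi & 0xffff);
      if (*hi >= n) return 0;
      if (*lo < 0) return -*lo < n;
      return *lo < n;
    }

    int main(void) {
      int lo, hi;
      printf("%d\n", indices_ok((5L << 16) | 9, 16, &lo, &hi));  /* 1: in range */
      printf("%d\n", indices_ok((5L << 16) | 99, 16, &lo, &hi)); /* 0: hi too big */
      return 0;
    }
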
diff --git a/media/libvorbis/lib/vorbis_registry.c b/media/libvorbis/lib/vorbis_registry.c
index 3961ed1403..db0f67b2e2 100644
--- a/media/libvorbis/lib/vorbis_registry.c
+++ b/media/libvorbis/lib/vorbis_registry.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: registry for time, floor, res backends and channel mappings
- last mod: $Id: registry.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_res0.c b/media/libvorbis/lib/vorbis_res0.c
index ec11488c2f..c931aded38 100644
--- a/media/libvorbis/lib/vorbis_res0.c
+++ b/media/libvorbis/lib/vorbis_res0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: residue backend 0, 1 and 2 implementation
- last mod: $Id: res0.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -31,9 +30,6 @@
#include "misc.h"
#include "os.h"
-//#define TRAIN_RES 1
-//#define TRAIN_RESAUX 1
-
#if defined(TRAIN_RES) || defined (TRAIN_RESAUX)
#include <stdio.h>
#endif
diff --git a/media/libvorbis/lib/vorbis_sharedbook.c b/media/libvorbis/lib/vorbis_sharedbook.c
index 6bfdf7311e..444f42b5aa 100644
--- a/media/libvorbis/lib/vorbis_sharedbook.c
+++ b/media/libvorbis/lib/vorbis_sharedbook.c
@@ -6,16 +6,16 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic shared codebook operations
- last mod: $Id: sharedbook.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
#include <stdlib.h>
+#include <limits.h>
#include <math.h>
#include <string.h>
#include <ogg/ogg.h>
@@ -50,7 +50,7 @@ long _float32_pack(float val){
sign=0x80000000;
val= -val;
}
- exp= floor(log(val)/log(2.f)+.001); //+epsilon
+ exp= floor(log(val)/log(2.f)+.001); /* +epsilon */
mant=rint(ldexp(val,(VQ_FMAN-1)-exp));
exp=(exp+VQ_FEXP_BIAS)<<VQ_FMAN;
@@ -62,7 +62,15 @@ float _float32_unpack(long val){
int sign=val&0x80000000;
long exp =(val&0x7fe00000L)>>VQ_FMAN;
if(sign)mant= -mant;
- return(ldexp(mant,exp-(VQ_FMAN-1)-VQ_FEXP_BIAS));
+ exp=exp-(VQ_FMAN-1)-VQ_FEXP_BIAS;
+ /* clamp excessive exponent values */
+ if (exp>63){
+ exp=63;
+ }
+ if (exp<-63){
+ exp=-63;
+ }
+ return(ldexp(mant,exp));
}
/* given a list of word lengths, generate a list of codewords. Works
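
The _float32_unpack change above matters because the function feeds ldexp an exponent decoded straight from the bitstream; a crafted codebook can request an exponent far outside anything a valid stream produces, yielding infinities or denormal-range values downstream. Clamping to +/-63 bounds the result without affecting well-formed streams. A standalone sketch, with the constants assumed to match codebook.h:

    #include <math.h>
    #include <stdio.h>

    #define VQ_FMAN      21   /* mantissa bits, as in codebook.h */
    #define VQ_FEXP_BIAS 768  /* exponent bias, as in codebook.h */

    static float float32_unpack(long val) {
      double mant = val & 0x1fffff;
      long   exp  = (val & 0x7fe00000L) >> VQ_FMAN;
      if (val & 0x80000000L) mant = -mant;
      exp = exp - (VQ_FMAN - 1) - VQ_FEXP_BIAS;
      if (exp >  63) exp =  63;   /* clamp what a hostile stream can request */
      if (exp < -63) exp = -63;
      return (float)ldexp(mant, (int)exp);
    }

    int main(void) {
      printf("%g\n", float32_unpack(0x7fe00001L)); /* max exponent: clamped, finite */
      return 0;
    }
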
@@ -158,25 +166,34 @@ ogg_uint32_t *_make_words(char *l,long n,long sparsecount){
that's portable and totally safe against roundoff, but I haven't
thought of it. Therefore, we opt on the side of caution */
long _book_maptype1_quantvals(const static_codebook *b){
- long vals=floor(pow((float)b->entries,1.f/b->dim));
+ long vals;
+ if(b->entries<1){
+ return(0);
+ }
+ vals=floor(pow((float)b->entries,1.f/b->dim));
/* the above *should* be reliable, but we'll not assume that FP is
ever reliable when bitstream sync is at stake; verify via integer
means that vals really is the greatest value of dim for which
vals^b->bim <= b->entries */
/* treat the above as an initial guess */
+ if(vals<1){
+ vals=1;
+ }
while(1){
long acc=1;
long acc1=1;
int i;
for(i=0;i<b->dim;i++){
+ if(b->entries/vals<acc)break;
acc*=vals;
- acc1*=vals+1;
+ if(LONG_MAX/(vals+1)<acc1)acc1=LONG_MAX;
+ else acc1*=vals+1;
}
- if(acc<=b->entries && acc1>b->entries){
+ if(i>=b->dim && acc<=b->entries && acc1>b->entries){
return(vals);
}else{
- if(acc>b->entries){
+ if(i<b->dim || acc>b->entries){
vals--;
}else{
vals++;
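
_book_maptype1_quantvals verifies its floating-point guess by computing vals^dim in a loop; with hostile entries/dim values the products could overflow long, making the verification itself wrong. The patched loop rejects a multiply that would exceed entries by dividing first, and saturates the vals+1 product at LONG_MAX. A minimal sketch of the division-based overflow guard (assumes vals >= 1, which the patch now enforces):

    #include <stdio.h>

    /* Does vals^dim stay <= entries, without ever overflowing? */
    static int pow_le(long vals, int dim, long entries) {
      long acc = 1;
      int i;
      for (i = 0; i < dim; i++) {
        if (entries / vals < acc) return 0;  /* acc * vals would exceed entries */
        acc *= vals;
      }
      return acc <= entries;
    }

    int main(void) {
      printf("%d %d\n", pow_le(3, 4, 100), pow_le(3, 4, 80)); /* 1 0 (3^4 = 81) */
      return 0;
    }
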
@@ -285,7 +302,7 @@ int vorbis_book_init_encode(codebook *c,const static_codebook *s){
c->used_entries=s->entries;
c->dim=s->dim;
c->codelist=_make_words(s->lengthlist,s->entries,0);
- //c->valuelist=_book_unquantize(s,s->entries,NULL);
+ /* c->valuelist=_book_unquantize(s,s->entries,NULL); */
c->quantvals=_book_maptype1_quantvals(s);
c->minval=(int)rint(_float32_unpack(s->q_min));
c->delta=(int)rint(_float32_unpack(s->q_delta));
@@ -564,6 +581,7 @@ void run_test(static_codebook *b,float *comp){
exit(1);
}
}
+ _ogg_free(out);
}
int main(){
diff --git a/media/libvorbis/lib/vorbis_smallft.c b/media/libvorbis/lib/vorbis_smallft.c
index ae2bc41b6b..4ffabab4bb 100644
--- a/media/libvorbis/lib/vorbis_smallft.c
+++ b/media/libvorbis/lib/vorbis_smallft.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: *unnormalized* fft transform
- last mod: $Id: smallft.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_synthesis.c b/media/libvorbis/lib/vorbis_synthesis.c
index 932d271a63..3e2d681270 100644
--- a/media/libvorbis/lib/vorbis_synthesis.c
+++ b/media/libvorbis/lib/vorbis_synthesis.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: single-block PCM synthesis
- last mod: $Id: synthesis.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -117,7 +116,7 @@ int vorbis_synthesis_trackonly(vorbis_block *vb,ogg_packet *op){
if(!ci->mode_param[mode]){
return(OV_EBADPACKET);
}
-
+
vb->W=ci->mode_param[mode]->blockflag;
if(vb->W){
vb->lW=oggpack_read(opb,1);
diff --git a/media/libvorbis/lib/vorbis_window.c b/media/libvorbis/lib/vorbis_window.c
index 0305b79297..2151b278d1 100644
--- a/media/libvorbis/lib/vorbis_window.c
+++ b/media/libvorbis/lib/vorbis_window.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: window functions
- last mod: $Id: window.c 19028 2013-12-02 23:23:39Z tterribe $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbisenc.c b/media/libvorbis/lib/vorbisenc.c
index b5d621e900..cf3806a6e1 100644
--- a/media/libvorbis/lib/vorbisenc.c
+++ b/media/libvorbis/lib/vorbisenc.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: simple programmatic interface for encoder mode setup
- last mod: $Id: vorbisenc.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
@@ -685,6 +684,7 @@ int vorbis_encode_setup_init(vorbis_info *vi){
highlevel_encode_setup *hi=&ci->hi;
if(ci==NULL)return(OV_EINVAL);
+ if(vi->channels<1||vi->channels>255)return(OV_EINVAL);
if(!hi->impulse_block_p)i0=1;
/* too low/high an ATH floater is nonsensical, but doesn't break anything */
@@ -1211,7 +1211,7 @@ int vorbis_encode_ctl(vorbis_info *vi,int number,void *arg){
hi->req,
hi->managed,
&new_base);
- if(!hi->setup)return OV_EIMPL;
+ if(!new_template)return OV_EIMPL;
hi->setup=new_template;
hi->base_setting=new_base;
vorbis_encode_setup_setting(vi,vi->channels,vi->rate);
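
The old test in vorbis_encode_ctl checked hi->setup, which still held the previous (non-NULL) template, so a failed lookup was never reported and the NULL result was committed anyway. The fix tests the value actually being installed. A minimal sketch of the check-before-commit shape, with hl_state and the lookup result as stand-in names:

    #include <stddef.h>
    #include <stdio.h>

    typedef struct { const void *setup; double base_setting; } hl_state;

    /* Validate the *new* value before overwriting state; returning an error
       here plays the role of OV_EIMPL in the real code. */
    static int commit_setup(hl_state *hi, const void *new_template, double new_base) {
      if (new_template == NULL) return -1;
      hi->setup = new_template;
      hi->base_setting = new_base;
      return 0;
    }

    int main(void) {
      hl_state hi = { "old", 1.0 };
      printf("%d\n", commit_setup(&hi, NULL, 2.0)); /* -1: stale state preserved */
      return 0;
    }
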
diff --git a/media/libvorbis/lib/window.h b/media/libvorbis/lib/window.h
index 51f97599f5..33d83f85f9 100644
--- a/media/libvorbis/lib/window.h
+++ b/media/libvorbis/lib/window.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: window functions
- last mod: $Id: window.h 19028 2013-12-02 23:23:39Z tterribe $
********************************************************************/
diff --git a/media/libvorbis/todo.txt b/media/libvorbis/todo.txt
deleted file mode 100644
index b0e1f93cda..0000000000
--- a/media/libvorbis/todo.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Open project list for further development:
-
-libvorbis:
-
-Meaningful error code returns
-
-still some padding at EOS
-
-Option for brute-forcing vq search on maptype 2 (helps on undertrained
-sets).
-
-encoder switch interface for binary compat through changes; ioctl()-like?
-
-API changes:
- break up some of the more monolithic calls (eg, allow access
- to MDCT domain data, additional low level framing capability)
- convenience calls for text comments
-
-other:
-
-command line suite
-'crashme'
diff --git a/media/libvorbis/update.sh b/media/libvorbis/update.sh
index ae82a8d8b3..b2ad0b23cb 100644..100755
--- a/media/libvorbis/update.sh
+++ b/media/libvorbis/update.sh
@@ -1,4 +1,8 @@
-# Usage: /bin/sh update.sh <vorbis_src_directory>
+#!/bin/sh
+if test $# -ne 1; then
+ echo "Usage: /bin/sh update.sh <vorbis_src_directory>"
+ exit 1
+fi
#
# Copies the needed files from a directory containing the original
# libvorbis source that we need for the Mozilla HTML5 media support.
@@ -44,9 +48,9 @@ cp $1/lib/codebook.c ./lib/vorbis_codebook.c
cp $1/lib/bitrate.c ./lib/vorbis_bitrate.c
cp $1/lib/block.c ./lib/vorbis_block.c
cp $1/include/vorbis/codec.h ./include/vorbis/codec.h
-cp $1/todo.txt ./todo.txt
+cp $1/CHANGES ./CHANGES
cp $1/COPYING ./COPYING
-cp $1/README ./README
+cp $1/README.md ./README.md
cp $1/AUTHORS ./AUTHORS
# Encoder support
@@ -81,4 +85,4 @@ cp $1/lib/books/floor/floor_books.h ./lib/books/floor/
cp $1/lib/books/uncoupled/res_books_uncoupled.h ./lib/books/uncoupled/
# Add any patches against upstream here.
-# ...nothing to apply...
+
diff --git a/media/libwebp/AUTHORS b/media/libwebp/AUTHORS
index 0d70b7fb2a..8307c2099d 100644
--- a/media/libwebp/AUTHORS
+++ b/media/libwebp/AUTHORS
@@ -1,9 +1,15 @@
Contributors:
+- Aidan O'Loan (aidanol at gmail dot com)
- Alan Browning (browning at google dot com)
- Charles Munger (clm at google dot com)
+- Cheng Yi (cyi at google dot com)
- Christian Duvivier (cduvivier at google dot com)
+- Christopher Degawa (ccom at randomderp dot com)
+- Clement Courbet (courbet at google dot com)
- Djordje Pesut (djordje dot pesut at imgtec dot com)
- Hui Su (huisu at google dot com)
+- Ilya Kurdyukov (jpegqs at gmail dot com)
+- Ingvar Stepanyan (rreverser at google dot com)
- James Zern (jzern at google dot com)
- Jan Engelhardt (jengelh at medozas dot de)
- Jehan (jehan at girinstud dot io)
@@ -20,11 +26,13 @@ Contributors:
- Mislav Bradac (mislavm at google dot com)
- Nico Weber (thakis at chromium dot org)
- Noel Chromium (noel at chromium dot org)
+- Oliver Wolff (oliver dot wolff at qt dot io)
- Owen Rodley (orodley at google dot com)
- Parag Salasakar (img dot mips1 at gmail dot com)
- Pascal Massimino (pascal dot massimino at gmail dot com)
- Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
- Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
- Sam Clegg (sbc at chromium dot org)
- Scott Hancher (seh at google dot com)
- Scott LaVarnway (slavarnway at google dot com)
@@ -38,5 +46,7 @@ Contributors:
- Vikas Arora (vikasa at google dot com)
- Vincent Rabaud (vrabaud at google dot com)
- Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
+- Wan-Teh Chang (wtc at google dot com)
- Yang Zhang (yang dot zhang at arm dot com)
- Yannis Guyon (yguyon at google dot com)
+- Zhi An Ng (zhin at chromium dot org)
diff --git a/media/libwebp/NEWS b/media/libwebp/NEWS
index aa393c819f..5b36c5cf30 100644
--- a/media/libwebp/NEWS
+++ b/media/libwebp/NEWS
@@ -1,3 +1,58 @@
+- 1/11/2022: version 1.2.2
+ This is a binary compatible release.
+ * webpmux: add "-set bgcolor A,R,G,B"
+ * add ARM64 NEON support for MSVC builds (#539)
+ * fix duplicate include error in Xcode when using multiple XCFrameworks in a
+ project (#542)
+ * doc updates and bug fixes (#538, #544, #548, #550)
+
+- 7/20/2021: version 1.2.1
+ This is a binary compatible release.
+ * minor lossless encoder improvements and x86 color conversion speed up
+ * add ARM64 simulator support to xcframeworkbuild.sh (#510)
+ * further security related hardening in libwebp & examples
+ (issues: #497, #508, #518)
+ (chromium: #1196480, #1196773, #1196775, #1196777, #1196778, #1196850)
+ (oss-fuzz: #28658, #28978)
+ * toolchain updates and bug fixes (#498, #501, #502, #504, #505, #506, #509,
+ #533)
+ * use more inclusive language within the source (#507)
+
+- 12/23/2020: version 1.2.0
+ * API changes:
+ - libwebp:
+ encode.h: add a qmin / qmax range for quality factor (cwebp adds -qrange)
+ * lossless encoder improvements
+ * SIMD support for Wasm builds
+ * add xcframeworkbuild.sh, supports Mac Catalyst builds
+ * import fuzzers from oss-fuzz & chromium (#409)
+ * webpmux: add an '-set loop <value>' option (#494)
+ * toolchain updates and bug fixes (#449, #463, #470, #475, #477, #478, #479,
+ #488, #491)
+
+- 12/18/2019: version 1.1.0
+ * API changes:
+ - libwebp:
+ WebPMalloc (issue #442)
+ - extras:
+ WebPUnmultiplyARGB
+ * alpha decode fix (issue #439)
+ * toolchain updates and bug fixes
+ (chromium: #1026858, #1027136, #1027409, #1028620, #1028716, #995200)
+ (oss-fuzz: #19430, #19447)
+
+- 7/4/2019: version 1.0.3
+ This is a binary compatible release.
+ * resize fixes for Nx1 sizes and the addition of non-opaque alpha values for
+ odd sizes (issues #418, #434)
+ * lossless encode/decode performance improvements
+ * lossy compression performance improvement at low quality levels with flat
+ content (issue #432)
+ * python swig files updated to support python 3
+ Tool updates:
+ vwebp will now preserve the aspect ratio of images that exceed monitor
+ resolution by scaling the image to fit (issue #433)
+
- 1/14/2019: version 1.0.2
This is a binary compatible release.
* (Windows) unicode file support in the tools (linux and mac already had
diff --git a/media/libwebp/README b/media/libwebp/README
index 502a4c1c20..f6eaf2c049 100644
--- a/media/libwebp/README
+++ b/media/libwebp/README
@@ -4,7 +4,7 @@
\__\__/\____/\_____/__/ ____ ___
/ _/ / \ \ / _ \/ _/
/ \_/ / / \ \ __/ \__
- \____/____/\_____/_____/____/v1.0.2
+ \____/____/\_____/_____/____/v1.2.2
Description:
============
@@ -13,13 +13,13 @@ WebP codec: library to encode and decode images in WebP format. This package
contains the library that can be used in other programs to add WebP support,
as well as the command line tools 'cwebp' and 'dwebp'.
-See http://developers.google.com/speed/webp
+See https://developers.google.com/speed/webp
The latest source tree is available at
https://chromium.googlesource.com/webm/libwebp
It is released under the same license as the WebM project.
-See http://www.webmproject.org/license/software/ or the
+See https://www.webmproject.org/license/software/ or the
"COPYING" file for details. An additional intellectual
property rights grant can be found in the file PATENTS.
@@ -113,7 +113,7 @@ make install
CMake:
------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
and the JS bindings.
Prerequisites:
@@ -225,6 +225,7 @@ Usage:
If input size (-s) for an image is not specified, it is
assumed to be a PNG, JPEG, TIFF or WebP file.
+Note: Animated PNG and WebP files are not supported.
Options:
-h / -help ............. short help
@@ -254,6 +255,8 @@ Options:
-partition_limit <int> . limit quality to fit the 512k limit on
the first partition (0=no degradation ... 100=full)
-pass <int> ............ analysis pass number (1..10)
+ -qrange <min> <max> .... specifies the permissible quality range
+ (default: 0 100)
-crop <x> <y> <w> <h> .. crop picture with the given rectangle
-resize <w> <h> ........ resize picture (after any cropping)
-mt .................... use multi-threading if available
@@ -294,6 +297,7 @@ Experimental Options:
-af .................... auto-adjust filter strength
-pre <int> ............. pre-processing filter
+
The main options you might want to try in order to further tune the
visual quality are:
-preset
@@ -341,7 +345,9 @@ The full list of options is available using -h:
> dwebp -h
Usage: dwebp in_file [options] [-o out_file]
-Decodes the WebP image file to PNG format [Default]
+Decodes the WebP image file to PNG format [Default].
+Note: Animated WebP files are not supported.
+
Use following options to convert into alternate image formats:
-pam ......... save the raw RGBA samples as a color PAM
-ppm ......... save the raw RGB samples as a color PPM
@@ -423,15 +429,15 @@ Prerequisites:
1) OpenGL & OpenGL Utility Toolkit (GLUT)
Linux:
$ sudo apt-get install freeglut3-dev mesa-common-dev
- Mac + XCode:
+ Mac + Xcode:
- These libraries should be available in the OpenGL / GLUT frameworks.
Windows:
http://freeglut.sourceforge.net/index.php#download
2) (Optional) qcms (Quick Color Management System)
i. Download qcms from Mozilla / Chromium:
- http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
- http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+ https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+ https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
ii. Build and archive the source files as libqcms.a / qcms.lib
iii. Update makefile.unix / Makefile.vc
a) Define WEBP_HAVE_QCMS
@@ -450,7 +456,7 @@ modes, etc.
Usage:
- img2webp [file-level options] [image files...] [per-frame options...]
+ img2webp [file_options] [[frame_options] frame_file]...
File-level options (only used at the start of compression):
-min_size ............ minimize size
@@ -597,7 +603,7 @@ The encoding flow looks like:
// Setup a config, starting form a preset and tuning some additional
// parameters
WebPConfig config;
- if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor))
+ if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
return 0; // version error
}
// ... additional tuning
@@ -613,7 +619,7 @@ The encoding flow looks like:
pic.width = width;
pic.height = height;
// allocated picture of dimension width x height
- if (!WebPPictureAllocate(&pic)) {
+ if (!WebPPictureAlloc(&pic)) {
return 0; // memory error
}
// at this point, 'pic' has been initialized as a container,
@@ -780,10 +786,10 @@ Bugs:
Please report all bugs to the issue tracker:
https://bugs.chromium.org/p/webp
Patches welcome! See this page to get started:
- http://www.webmproject.org/code/contribute/submitting-patches/
+ https://www.webmproject.org/code/contribute/submitting-patches/
Discuss:
========
Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/README.mux b/media/libwebp/README.mux
index 7e9c3c903b..099d8e061d 100644
--- a/media/libwebp/README.mux
+++ b/media/libwebp/README.mux
@@ -1,7 +1,7 @@
 __ __ ____ ____ ____ __ __ _ __ __
/ \\/ \/ _ \/ _ \/ _ \/ \ \/ \___/_ / _\
\ / __/ _ \ __/ / / (_/ /__
- \__\__/\_____/_____/__/ \__//_/\_____/__/___/v1.0.2
+ \__\__/\_____/_____/__/ \__//_/\_____/__/___/v1.2.2
Description:
@@ -43,10 +43,12 @@ GET_OPTIONS:
frame n get nth frame
SET_OPTIONS:
- Set color profile/metadata:
- icc file.icc set ICC profile
- exif file.exif set EXIF metadata
- xmp file.xmp set XMP metadata
+ Set color profile/metadata/parameters:
+ loop LOOP_COUNT set the loop count
+ bgcolor BACKGROUND_COLOR set the animation background color
+ icc file.icc set ICC profile
+ exif file.exif set EXIF metadata
+ xmp file.xmp set XMP metadata
where: 'file.icc' contains the ICC profile to be set,
'file.exif' contains the EXIF metadata to be set
'file.xmp' contains the XMP metadata to be set
@@ -247,10 +249,10 @@ Bugs:
Please report all bugs to the issue tracker:
https://bugs.chromium.org/p/webp
Patches welcome! See this page to get started:
- http://www.webmproject.org/code/contribute/submitting-patches/
+ https://www.webmproject.org/code/contribute/submitting-patches/
Discuss:
========
Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/UXPCHANGES b/media/libwebp/UXPCHANGES
index 78b7823c8d..bf1fe22d6e 100644
--- a/media/libwebp/UXPCHANGES
+++ b/media/libwebp/UXPCHANGES
@@ -3,3 +3,4 @@ Changes made to pristine libwebp source by Moonchild Productions and mozilla.org
2017/01/27 -- Synced with libwebp-0.6.0 (BZ #1294490).
2018/06/29 -- Synced with libwebp-1.0.0 + BUG=webp:381,383,384.
2019/01/21 -- Synced with libwebp-1.0.2
+2022/06/26 -- Synced with libwebp-1.2.2
diff --git a/media/libwebp/dec/alpha_dec.c b/media/libwebp/dec/alpha_dec.c
index 1ff7c62d8b..52c24037e4 100644
--- a/media/libwebp/dec/alpha_dec.c
+++ b/media/libwebp/dec/alpha_dec.c
@@ -183,7 +183,7 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
assert(dec != NULL && io != NULL);
if (row < 0 || num_rows <= 0 || row + num_rows > height) {
- return NULL; // sanity check.
+ return NULL;
}
if (!dec->is_alpha_decoded_) {
diff --git a/media/libwebp/dec/buffer_dec.c b/media/libwebp/dec/buffer_dec.c
index d72d32b0a9..0f3eed2cfe 100644
--- a/media/libwebp/dec/buffer_dec.c
+++ b/media/libwebp/dec/buffer_dec.c
@@ -102,7 +102,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
int stride;
uint64_t size;
- if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+ if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
return VP8_STATUS_INVALID_PARAM;
}
stride = w * kModeBpp[mode];
@@ -117,7 +117,6 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
}
total_size = size + 2 * uv_size + a_size;
- // Security/sanity checks
output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
if (output == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
@@ -156,11 +155,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
}
if (WebPIsRGBMode(buffer->colorspace)) {
WebPRGBABuffer* const buf = &buffer->u.RGBA;
- buf->rgba += (buffer->height - 1) * buf->stride;
+ buf->rgba += (int64_t)(buffer->height - 1) * buf->stride;
buf->stride = -buf->stride;
} else {
WebPYUVABuffer* const buf = &buffer->u.YUVA;
- const int H = buffer->height;
+ const int64_t H = buffer->height;
buf->y += (H - 1) * buf->y_stride;
buf->y_stride = -buf->y_stride;
buf->u += ((H - 1) >> 1) * buf->u_stride;
@@ -188,8 +187,7 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const int ch = options->crop_height;
const int x = options->crop_left & ~1;
const int y = options->crop_top & ~1;
- if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
- x + cw > width || y + ch > height) {
+ if (!WebPCheckCropDimensions(width, height, x, y, cw, ch)) {
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
}
width = cw;
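
In buffer_dec.c the stride is a signed int and WebPFlipBuffer negates it, so a row byte count of 2^31 would already overflow; the bound therefore tightens from 2^32 to 2^31, and the flip offsets are computed in 64 bits. A tiny sketch of the tightened check (4 bytes per pixel stands in for kModeBpp[mode]):

    #include <stdint.h>
    #include <stdio.h>

    /* Row size must fit a signed 32-bit int, since the stride can be negated. */
    static int stride_ok(uint64_t width, uint64_t bytes_per_pixel) {
      return width * bytes_per_pixel < (1ull << 31);
    }

    int main(void) {
      printf("%d\n", stride_ok(0x20000000ull, 4)); /* 2^29 * 4 = 2^31: rejected */
      printf("%d\n", stride_ok(0x1fffffffull, 4)); /* just under: accepted */
      return 0;
    }
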
diff --git a/media/libwebp/dec/frame_dec.c b/media/libwebp/dec/frame_dec.c
index 3d1d662746..d4cdc15344 100644
--- a/media/libwebp/dec/frame_dec.c
+++ b/media/libwebp/dec/frame_dec.c
@@ -705,7 +705,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
+ cache_size + alpha_size + WEBP_ALIGN_CST;
uint8_t* mem;
- if (needed != (size_t)needed) return 0; // check for overflow
+ if (!CheckSizeOverflow(needed)) return 0; // check for overflow
if (needed > dec->mem_size_) {
WebPSafeFree(dec->mem_);
dec->mem_size_ = 0;
@@ -732,7 +732,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem += f_info_size;
dec->thread_ctx_.id_ = 0;
dec->thread_ctx_.f_info_ = dec->f_info_;
- if (dec->mt_method_ > 0) {
+ if (dec->filter_type_ > 0 && dec->mt_method_ > 0) {
// secondary cache line. The deblocking process need to make use of the
// filtering strength from previous macroblock row, while the new ones
// are being decoded in parallel. We'll just swap the pointers.
diff --git a/media/libwebp/dec/idec_dec.c b/media/libwebp/dec/idec_dec.c
index ee0d33eac4..3a592d59ed 100644
--- a/media/libwebp/dec/idec_dec.c
+++ b/media/libwebp/dec/idec_dec.c
@@ -166,9 +166,11 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
MemBuffer* const mem = &idec->mem_;
const int need_compressed_alpha = NeedCompressedAlpha(idec);
- const uint8_t* const old_start = mem->buf_ + mem->start_;
+ const uint8_t* const old_start =
+ (mem->buf_ == NULL) ? NULL : mem->buf_ + mem->start_;
const uint8_t* const old_base =
need_compressed_alpha ? dec->alpha_data_ : old_start;
+ assert(mem->buf_ != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_APPEND);
if (data_size > MAX_CHUNK_PAYLOAD) {
// security safeguard: trying to allocate more than what the format
@@ -184,7 +186,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
uint8_t* const new_buf =
(uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
if (new_buf == NULL) return 0;
- memcpy(new_buf, old_base, current_size);
+ if (old_base != NULL) memcpy(new_buf, old_base, current_size);
WebPSafeFree(mem->buf_);
mem->buf_ = new_buf;
mem->buf_size_ = (size_t)extra_size;
@@ -192,6 +194,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
mem->end_ = current_size;
}
+ assert(mem->buf_ != NULL);
memcpy(mem->buf_ + mem->end_, data, data_size);
mem->end_ += data_size;
assert(mem->end_ <= mem->buf_size_);
@@ -204,7 +207,9 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
const uint8_t* const data, size_t data_size) {
MemBuffer* const mem = &idec->mem_;
const uint8_t* const old_buf = mem->buf_;
- const uint8_t* const old_start = old_buf + mem->start_;
+ const uint8_t* const old_start =
+ (old_buf == NULL) ? NULL : old_buf + mem->start_;
+ assert(old_buf != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_MAP);
if (data_size < mem->buf_size_) return 0; // can't remap to a shorter buffer!
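
Before any data arrives, mem->buf_ is NULL, and NULL + mem->start_ is undefined behavior in C even when the offset is zero (UBSan flags it); the same applied to the memcpy from a NULL source. The guards above compute the derived pointer only when the base exists. A minimal sketch of the guarded-base pattern:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Never form base + offset when base may be NULL. */
    static const uint8_t *start_ptr(const uint8_t *buf, size_t start) {
      assert(buf != NULL || start == 0);
      return (buf == NULL) ? NULL : buf + start;
    }

    int main(void) {
      uint8_t data[8] = {0};
      printf("%p\n", (const void *)start_ptr(NULL, 0)); /* NULL, no UB */
      printf("%p\n", (const void *)start_ptr(data, 4));
      return 0;
    }
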
diff --git a/media/libwebp/dec/io_dec.c b/media/libwebp/dec/io_dec.c
index 0edd9f526e..6124c61393 100644
--- a/media/libwebp/dec/io_dec.c
+++ b/media/libwebp/dec/io_dec.c
@@ -25,21 +25,16 @@
static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* output = p->output;
const WebPYUVABuffer* const buf = &output->u.YUVA;
- uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
- uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
- uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+ uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
+ uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
+ uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
const int uv_w = (mb_w + 1) / 2;
const int uv_h = (mb_h + 1) / 2;
- int j;
- for (j = 0; j < mb_h; ++j) {
- memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
- }
- for (j = 0; j < uv_h; ++j) {
- memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
- memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
- }
+ WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
+ WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
+ WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
return io->mb_h;
}
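
The recurring (size_t) casts throughout this file all fix the same latent bug: io->mb_y * buf->y_stride was an int-by-int multiply, which overflows before the pointer addition ever sees a wide type once the row offset passes INT_MAX on very large images. Casting one operand first makes the whole product pointer-width. (The WebPCopyPlane change is simply deduplication of the row-copy loops.) A tiny illustration with made-up dimensions:

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
      int row = 70000, stride = 40000;       /* hypothetical large image */
      /* (size_t)(row * stride) would overflow the int multiply first (UB); */
      size_t off = (size_t)row * stride;     /* 64-bit product on LP64 */
      printf("%zu\n", off);                  /* 2800000000 */
      return 0;
    }
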
@@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* const output = p->output;
WebPRGBABuffer* const buf = &output->u.RGBA;
- uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+ uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPSamplerProcessPlane(io->y, io->y_stride,
io->u, io->v, io->uv_stride,
dst, buf->stride, io->mb_w, io->mb_h,
@@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
int num_lines_out = io->mb_h; // a priori guess
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+ uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u;
@@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
- uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+ uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
int j;
(void)expected_num_lines_out;
assert(expected_num_lines_out == mb_h);
@@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
- const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+ const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
@@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
- const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+ const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
@@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
- uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
+ uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
if (io->a != NULL) {
- uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+ uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
assert(expected_num_lines_out == num_lines_out);
if (num_lines_out > 0) { // unmultiply the Y
@@ -303,46 +298,57 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int uv_out_height = (out_height + 1) >> 1;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
- const size_t work_size = 2 * out_width; // scratch memory for luma rescaler
+ // scratch memory for luma rescaler
+ const size_t work_size = 2 * (size_t)out_width;
const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones
- size_t tmp_size, rescaler_size;
+ uint64_t total_size;
+ size_t rescaler_size;
rescaler_t* work;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
- tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+ total_size = ((uint64_t)work_size + 2 * uv_work_size) * sizeof(*work);
if (has_alpha) {
- tmp_size += work_size * sizeof(*work);
+ total_size += (uint64_t)work_size * sizeof(*work);
}
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+ total_size += rescaler_size;
+ if (!CheckSizeOverflow(total_size)) {
+ return 0;
+ }
- p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
+ p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
- scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+ scalers = (WebPRescaler*)WEBP_ALIGN(
+ (const uint8_t*)work + total_size - rescaler_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
- WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
- buf->y, out_width, out_height, buf->y_stride, 1,
- work);
- WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
- buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
- work + work_size);
- WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
- buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
- work + work_size + uv_work_size);
+ if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+ buf->y, out_width, out_height, buf->y_stride, 1,
+ work) ||
+ !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+ buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+ work + work_size) ||
+ !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+ buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+ work + work_size + uv_work_size)) {
+ return 0;
+ }
p->emit = EmitRescaledYUV;
if (has_alpha) {
- WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
- buf->a, out_width, out_height, buf->a_stride, 1,
- work + work_size + 2 * uv_work_size);
+ if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+ buf->a, out_width, out_height, buf->a_stride, 1,
+ work + work_size + 2 * uv_work_size)) {
+ return 0;
+ }
p->emit_alpha = EmitRescaledAlphaYUV;
WebPInitAlphaProcessing();
}
@@ -356,7 +362,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
const WebPYUV444Converter convert =
WebPYUV444Converters[p->output->colorspace];
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* dst = buf->rgba + y_pos * buf->stride;
+ uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
int num_lines_out = 0;
// For RGB rescaling, because of the YUV420, current scan position
// U/V can be +1/-1 line from the Y one. Hence the double test.
@@ -383,15 +389,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
while (j < mb_h) {
const int y_lines_in =
WebPRescalerImport(p->scaler_y, mb_h - j,
- io->y + j * io->y_stride, io->y_stride);
+ io->y + (size_t)j * io->y_stride, io->y_stride);
j += y_lines_in;
if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
- const int u_lines_in =
- WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
- io->u + uv_j * io->uv_stride, io->uv_stride);
- const int v_lines_in =
- WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
- io->v + uv_j * io->uv_stride, io->uv_stride);
+ const int u_lines_in = WebPRescalerImport(
+ p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+ io->uv_stride);
+ const int v_lines_in = WebPRescalerImport(
+ p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+ io->uv_stride);
(void)v_lines_in; // remove a gcc warning
assert(u_lines_in == v_lines_in);
uv_j += u_lines_in;
@@ -403,7 +409,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+ uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int alpha_first =
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
@@ -431,7 +437,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+ uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
@@ -470,7 +476,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
int lines_left = expected_num_out_lines;
const int y_end = p->last_y + lines_left;
while (lines_left > 0) {
- const int row_offset = scaler->src_y - io->mb_y;
+ const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
io->a + row_offset * io->width, io->width);
lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
@@ -485,51 +491,58 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int out_height = io->scaled_height;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
- const size_t work_size = 2 * out_width; // scratch memory for one rescaler
+ // scratch memory for one rescaler
+ const size_t work_size = 2 * (size_t)out_width;
rescaler_t* work; // rescalers work area
uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion
- size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+ uint64_t tmp_size1, tmp_size2, total_size;
+ size_t rescaler_size;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
- tmp_size1 = 3 * work_size;
- tmp_size2 = 3 * out_width;
- if (has_alpha) {
- tmp_size1 += work_size;
- tmp_size2 += out_width;
- }
+ tmp_size1 = (uint64_t)num_rescalers * work_size;
+ tmp_size2 = (uint64_t)num_rescalers * out_width;
total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+ total_size += rescaler_size;
+ if (!CheckSizeOverflow(total_size)) {
+ return 0;
+ }
- p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
+ p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
tmp = (uint8_t*)(work + tmp_size1);
- scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+ scalers = (WebPRescaler*)WEBP_ALIGN(
+ (const uint8_t*)work + total_size - rescaler_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
- WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
- tmp + 0 * out_width, out_width, out_height, 0, 1,
- work + 0 * work_size);
- WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
- tmp + 1 * out_width, out_width, out_height, 0, 1,
- work + 1 * work_size);
- WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
- tmp + 2 * out_width, out_width, out_height, 0, 1,
- work + 2 * work_size);
+ if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+ tmp + 0 * out_width, out_width, out_height, 0, 1,
+ work + 0 * work_size) ||
+ !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+ tmp + 1 * out_width, out_width, out_height, 0, 1,
+ work + 1 * work_size) ||
+ !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+ tmp + 2 * out_width, out_width, out_height, 0, 1,
+ work + 2 * work_size)) {
+ return 0;
+ }
p->emit = EmitRescaledRGB;
WebPInitYUV444Converters();
if (has_alpha) {
- WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
- tmp + 3 * out_width, out_width, out_height, 0, 1,
- work + 3 * work_size);
+ if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+ tmp + 3 * out_width, out_width, out_height, 0, 1,
+ work + 3 * work_size)) {
+ return 0;
+ }
p->emit_alpha = EmitRescaledAlphaRGB;
if (p->output->colorspace == MODE_RGBA_4444 ||
p->output->colorspace == MODE_rgbA_4444) {
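
Both rescaler initializers now accumulate their scratch-buffer sizes in uint64_t, fold the aligned rescaler block into the same total, and verify the sum fits size_t before the single allocation; WebPRescalerInit additionally reports failure instead of returning void. A minimal sketch of the overflow-checked allocation (CheckSizeOverflow in the patch is essentially the comparison shown):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Allocate only if the 64-bit total survives the round-trip to size_t. */
    static void *alloc_checked(uint64_t total) {
      if (total != (uint64_t)(size_t)total) return NULL; /* truncates on 32-bit */
      return malloc((size_t)total);
    }

    int main(void) {
      void *p = alloc_checked((uint64_t)1 << 20);
      printf("%s\n", p != NULL ? "ok" : "overflow/oom");
      free(p);
      return 0;
    }
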
diff --git a/media/libwebp/dec/quant_dec.c b/media/libwebp/dec/quant_dec.c
index 6ecaf1c453..351da5f561 100644
--- a/media/libwebp/dec/quant_dec.c
+++ b/media/libwebp/dec/quant_dec.c
@@ -61,12 +61,17 @@ static const uint16_t kAcTable[128] = {
void VP8ParseQuant(VP8Decoder* const dec) {
VP8BitReader* const br = &dec->br_;
- const int base_q0 = VP8GetValue(br, 7);
- const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+ const int base_q0 = VP8GetValue(br, 7, "global-header");
+ const int dqy1_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dqy2_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dqy2_ac = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dquv_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dquv_ac = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
int i;
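
The extra string argument added to every VP8Get/VP8GetValue/VP8GetBit/VP8GetSignedValue call here and in the files below is a section label for libwebp's bit-accounting build (the BITTRACE option); in normal builds the parameter compiles away, so the calls are otherwise unchanged. A simplified stand-in showing the shape (the real reader pulls the returned bits from the stream):

    #include <stdio.h>

    #define BITTRACE 1   /* 0 in normal builds */

    static unsigned GetValue(unsigned *pos, int bits, const char *label) {
    #if BITTRACE
      fprintf(stderr, "%-16s %d bits at offset %u\n", label, bits, *pos);
    #else
      (void)label;       /* label costs nothing when tracing is off */
    #endif
      *pos += bits;
      return 0;          /* stand-in: a real reader returns the decoded value */
    }

    int main(void) {
      unsigned pos = 0;
      GetValue(&pos, 7, "global-header");
      GetValue(&pos, 2, "global-header");
      return 0;
    }
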
diff --git a/media/libwebp/dec/tree_dec.c b/media/libwebp/dec/tree_dec.c
index 5818860254..b219cdd2c9 100644
--- a/media/libwebp/dec/tree_dec.c
+++ b/media/libwebp/dec/tree_dec.c
@@ -296,20 +296,21 @@ static void ParseIntraMode(VP8BitReader* const br,
// to decode more than 1 keyframe.
if (dec->segment_hdr_.update_map_) {
// Hardcoded tree parsing
- block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
- ? VP8GetBit(br, dec->proba_.segments_[1])
- : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+ block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0], "segments")
+ ? VP8GetBit(br, dec->proba_.segments_[1], "segments")
+ : VP8GetBit(br, dec->proba_.segments_[2], "segments") + 2;
} else {
block->segment_ = 0; // default for intra
}
- if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
+ if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_, "skip");
- block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first
+ block->is_i4x4_ = !VP8GetBit(br, 145, "block-size");
if (!block->is_i4x4_) {
// Hardcoded 16x16 intra-mode decision tree.
const int ymode =
- VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
- : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+ VP8GetBit(br, 156, "pred-modes") ?
+ (VP8GetBit(br, 128, "pred-modes") ? TM_PRED : H_PRED) :
+ (VP8GetBit(br, 163, "pred-modes") ? V_PRED : DC_PRED);
block->imodes_[0] = ymode;
memset(top, ymode, 4 * sizeof(*top));
memset(left, ymode, 4 * sizeof(*left));
@@ -323,22 +324,25 @@ static void ParseIntraMode(VP8BitReader* const br,
const uint8_t* const prob = kBModesProba[top[x]][ymode];
#if (USE_GENERIC_TREE == 1)
// Generic tree-parsing
- int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+ int i = kYModesIntra4[VP8GetBit(br, prob[0], "pred-modes")];
while (i > 0) {
- i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+ i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i], "pred-modes")];
}
ymode = -i;
#else
// Hardcoded tree parsing
- ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
- !VP8GetBit(br, prob[1]) ? B_TM_PRED :
- !VP8GetBit(br, prob[2]) ? B_VE_PRED :
- !VP8GetBit(br, prob[3]) ?
- (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
- (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
- (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
- (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
- (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+ ymode = !VP8GetBit(br, prob[0], "pred-modes") ? B_DC_PRED :
+ !VP8GetBit(br, prob[1], "pred-modes") ? B_TM_PRED :
+ !VP8GetBit(br, prob[2], "pred-modes") ? B_VE_PRED :
+ !VP8GetBit(br, prob[3], "pred-modes") ?
+ (!VP8GetBit(br, prob[4], "pred-modes") ? B_HE_PRED :
+ (!VP8GetBit(br, prob[5], "pred-modes") ? B_RD_PRED
+ : B_VR_PRED)) :
+ (!VP8GetBit(br, prob[6], "pred-modes") ? B_LD_PRED :
+ (!VP8GetBit(br, prob[7], "pred-modes") ? B_VL_PRED :
+ (!VP8GetBit(br, prob[8], "pred-modes") ? B_HD_PRED
+ : B_HU_PRED))
+ );
#endif // USE_GENERIC_TREE
top[x] = ymode;
}
@@ -348,9 +352,9 @@ static void ParseIntraMode(VP8BitReader* const br,
}
}
// Hardcoded UVMode decision tree
- block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
- : !VP8GetBit(br, 114) ? V_PRED
- : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+ block->uvmode_ = !VP8GetBit(br, 142, "pred-modes-uv") ? DC_PRED
+ : !VP8GetBit(br, 114, "pred-modes-uv") ? V_PRED
+ : VP8GetBit(br, 183, "pred-modes-uv") ? TM_PRED : H_PRED;
}
int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
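
Each branch of the "hardcoded tree parsing" above walks one edge of a small decision tree, with a fixed probability per node (142, 114, 183 for the UV modes). A sketch of the same shape, assuming a hypothetical get_bit(prob) boolean decoder standing in for VP8GetBit:

    enum { MY_DC_PRED, MY_V_PRED, MY_H_PRED, MY_TM_PRED };

    /* Mirrors the uvmode_ tree: !bit(142) -> DC, else !bit(114) -> V,
     * else bit(183) picks TM over H. */
    static int decode_uv_mode(int (*get_bit)(int prob)) {
      if (!get_bit(142)) return MY_DC_PRED;
      if (!get_bit(114)) return MY_V_PRED;
      return get_bit(183) ? MY_TM_PRED : MY_H_PRED;
    }
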
@@ -514,8 +518,10 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
for (b = 0; b < NUM_BANDS; ++b) {
for (c = 0; c < NUM_CTX; ++c) {
for (p = 0; p < NUM_PROBAS; ++p) {
- const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
- VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+ const int v =
+ VP8GetBit(br, CoeffsUpdateProba[t][b][c][p], "global-header") ?
+ VP8GetValue(br, 8, "global-header") :
+ CoeffsProba0[t][b][c][p];
proba->bands_[t][b].probas_[c][p] = v;
}
}
@@ -524,9 +530,8 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
}
}
- dec->use_skip_proba_ = VP8Get(br);
+ dec->use_skip_proba_ = VP8Get(br, "global-header");
if (dec->use_skip_proba_) {
- dec->skip_p_ = VP8GetValue(br, 8);
+ dec->skip_p_ = VP8GetValue(br, 8, "global-header");
}
}
-
diff --git a/media/libwebp/dec/vp8_dec.c b/media/libwebp/dec/vp8_dec.c
index e7958be6b0..5f51363e53 100644
--- a/media/libwebp/dec/vp8_dec.c
+++ b/media/libwebp/dec/vp8_dec.c
@@ -161,23 +161,26 @@ static int ParseSegmentHeader(VP8BitReader* br,
VP8SegmentHeader* hdr, VP8Proba* proba) {
assert(br != NULL);
assert(hdr != NULL);
- hdr->use_segment_ = VP8Get(br);
+ hdr->use_segment_ = VP8Get(br, "global-header");
if (hdr->use_segment_) {
- hdr->update_map_ = VP8Get(br);
- if (VP8Get(br)) { // update data
+ hdr->update_map_ = VP8Get(br, "global-header");
+ if (VP8Get(br, "global-header")) { // update data
int s;
- hdr->absolute_delta_ = VP8Get(br);
+ hdr->absolute_delta_ = VP8Get(br, "global-header");
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
- hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+ hdr->quantizer_[s] = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 7, "global-header") : 0;
}
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
- hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+ hdr->filter_strength_[s] = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 6, "global-header") : 0;
}
}
if (hdr->update_map_) {
int s;
for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
- proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+ proba->segments_[s] = VP8Get(br, "global-header") ?
+ VP8GetValue(br, 8, "global-header") : 255u;
}
}
} else {
@@ -205,7 +208,7 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
size_t last_part;
size_t p;
- dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2)) - 1;
+ dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2, "global-header")) - 1;
last_part = dec->num_parts_minus_one_;
if (size < 3 * last_part) {
// we can't even read the sizes with sz[]! That's a failure.
@@ -229,21 +232,21 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
// Paragraph 9.4
static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
VP8FilterHeader* const hdr = &dec->filter_hdr_;
- hdr->simple_ = VP8Get(br);
- hdr->level_ = VP8GetValue(br, 6);
- hdr->sharpness_ = VP8GetValue(br, 3);
- hdr->use_lf_delta_ = VP8Get(br);
+ hdr->simple_ = VP8Get(br, "global-header");
+ hdr->level_ = VP8GetValue(br, 6, "global-header");
+ hdr->sharpness_ = VP8GetValue(br, 3, "global-header");
+ hdr->use_lf_delta_ = VP8Get(br, "global-header");
if (hdr->use_lf_delta_) {
- if (VP8Get(br)) { // update lf-delta?
+ if (VP8Get(br, "global-header")) { // update lf-delta?
int i;
for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
- if (VP8Get(br)) {
- hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+ if (VP8Get(br, "global-header")) {
+ hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
- if (VP8Get(br)) {
- hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+ if (VP8Get(br, "global-header")) {
+ hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
}
@@ -332,7 +335,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
io->scaled_width = io->width;
io->scaled_height = io->height;
- io->mb_w = io->width; // sanity check
+ io->mb_w = io->width; // for soundness
io->mb_h = io->height; // ditto
VP8ResetProba(&dec->proba_);
@@ -352,8 +355,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
buf_size -= frm_hdr->partition_length_;
if (frm_hdr->key_frame_) {
- pic_hdr->colorspace_ = VP8Get(br);
- pic_hdr->clamp_type_ = VP8Get(br);
+ pic_hdr->colorspace_ = VP8Get(br, "global-header");
+ pic_hdr->clamp_type_ = VP8Get(br, "global-header");
}
if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -378,7 +381,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
"Not a key frame.");
}
- VP8Get(br); // ignore the value of update_proba_
+ VP8Get(br, "global-header"); // ignore the value of update_proba_
VP8ParseProba(br, dec);
@@ -400,31 +403,31 @@ static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
int v;
- if (!VP8GetBit(br, p[3])) {
- if (!VP8GetBit(br, p[4])) {
+ if (!VP8GetBit(br, p[3], "coeffs")) {
+ if (!VP8GetBit(br, p[4], "coeffs")) {
v = 2;
} else {
- v = 3 + VP8GetBit(br, p[5]);
+ v = 3 + VP8GetBit(br, p[5], "coeffs");
}
} else {
- if (!VP8GetBit(br, p[6])) {
- if (!VP8GetBit(br, p[7])) {
- v = 5 + VP8GetBit(br, 159);
+ if (!VP8GetBit(br, p[6], "coeffs")) {
+ if (!VP8GetBit(br, p[7], "coeffs")) {
+ v = 5 + VP8GetBit(br, 159, "coeffs");
} else {
- v = 7 + 2 * VP8GetBit(br, 165);
- v += VP8GetBit(br, 145);
+ v = 7 + 2 * VP8GetBit(br, 165, "coeffs");
+ v += VP8GetBit(br, 145, "coeffs");
}
} else {
const uint8_t* tab;
- const int bit1 = VP8GetBit(br, p[8]);
- const int bit0 = VP8GetBit(br, p[9 + bit1]);
+ const int bit1 = VP8GetBit(br, p[8], "coeffs");
+ const int bit0 = VP8GetBit(br, p[9 + bit1], "coeffs");
const int cat = 2 * bit1 + bit0;
v = 0;
for (tab = kCat3456[cat]; *tab; ++tab) {
- v += v + VP8GetBit(br, *tab);
+ v += v + VP8GetBit(br, *tab, "coeffs");
}
v += 3 + (8 << cat);
}
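
The `v += v + VP8GetBit(br, *tab)` loop in GetLargeValue() accumulates the category's extra bits MSB-first, each coded under its own probability from the zero-terminated kCat3456 row (RFC 6386, section 13.2). The same idiom in isolation, with get_bit again a hypothetical stand-in:

    /* Builds a value one coded bit at a time; probs is 0-terminated. */
    static int read_extra_bits(int (*get_bit)(int prob),
                               const unsigned char* probs) {
      int v = 0;
      for (; *probs != 0; ++probs) {
        v += v + get_bit(*probs);  /* i.e. v = (v << 1) | bit */
      }
      return v;
    }
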
@@ -438,24 +441,24 @@ static int GetCoeffsFast(VP8BitReader* const br,
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
- if (!VP8GetBit(br, p[0])) {
+ if (!VP8GetBit(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
- while (!VP8GetBit(br, p[1])) { // sequence of zero coeffs
+ while (!VP8GetBit(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
- if (!VP8GetBit(br, p[2])) {
+ if (!VP8GetBit(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
- out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+ out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
@@ -468,36 +471,34 @@ static int GetCoeffsAlt(VP8BitReader* const br,
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
- if (!VP8GetBitAlt(br, p[0])) {
+ if (!VP8GetBitAlt(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
- while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs
+ while (!VP8GetBitAlt(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
- if (!VP8GetBitAlt(br, p[2])) {
+ if (!VP8GetBitAlt(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
- out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+ out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
}
-static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
- if (GetCoeffs == NULL) {
- if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
- GetCoeffs = GetCoeffsAlt;
- } else {
- GetCoeffs = GetCoeffsFast;
- }
+WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
+ if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+ GetCoeffs = GetCoeffsAlt;
+ } else {
+ GetCoeffs = GetCoeffsFast;
}
}
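
The switch to WEBP_DSP_INIT_FUNC replaces the lazy `if (GetCoeffs == NULL)` test, a racy-looking lazy initialization, with a guaranteed once-only setup. A minimal sketch of that guarantee using POSIX threads (the real macro is more portable than this; have_slow_ssse3() is a hypothetical CPU probe):

    #include <pthread.h>

    typedef int (*CoeffsFunc)(void);
    static CoeffsFunc get_coeffs;

    static int coeffs_fast(void) { return 0; }
    static int coeffs_alt(void)  { return 1; }
    static int have_slow_ssse3(void) { return 0; }  /* hypothetical probe */

    static void init_get_coeffs(void) {
      get_coeffs = have_slow_ssse3() ? coeffs_alt : coeffs_fast;
    }

    static pthread_once_t once = PTHREAD_ONCE_INIT;
    static void ensure_get_coeffs(void) { pthread_once(&once, init_get_coeffs); }
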
diff --git a/media/libwebp/dec/vp8i_dec.h b/media/libwebp/dec/vp8i_dec.h
index fabee44a0b..31d9080ca1 100644
--- a/media/libwebp/dec/vp8i_dec.h
+++ b/media/libwebp/dec/vp8i_dec.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 0
+#define DEC_MIN_VERSION 2
#define DEC_REV_VERSION 2
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
diff --git a/media/libwebp/dec/vp8l_dec.c b/media/libwebp/dec/vp8l_dec.c
index 0502cb9a52..32371a67fe 100644
--- a/media/libwebp/dec/vp8l_dec.c
+++ b/media/libwebp/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
// to 256 (green component values) + 24 (length prefix values)
// + color_cache_size (between 0 and 2048).
// All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
#define FIXED_TABLE_SIZE (630 * 3 + 410)
static const uint16_t kTableSize[12] = {
FIXED_TABLE_SIZE + 654,
@@ -362,12 +362,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
VP8LMetadata* const hdr = &dec->hdr_;
uint32_t* huffman_image = NULL;
HTreeGroup* htree_groups = NULL;
- // When reading htrees, some might be unused, as the format allows it.
- // We will still read them but put them in this htree_group_bogus.
- HTreeGroup htree_group_bogus;
HuffmanCode* huffman_tables = NULL;
- HuffmanCode* huffman_tables_bogus = NULL;
- HuffmanCode* next = NULL;
+ HuffmanCode* huffman_table = NULL;
int num_htree_groups = 1;
int num_htree_groups_max = 1;
int max_alphabet_size = 0;
@@ -418,12 +414,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
if (*mapped_group == -1) *mapped_group = num_htree_groups++;
huffman_image[i] = *mapped_group;
}
- huffman_tables_bogus = (HuffmanCode*)WebPSafeMalloc(
- table_size, sizeof(*huffman_tables_bogus));
- if (huffman_tables_bogus == NULL) {
- dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
- goto Error;
- }
} else {
num_htree_groups = num_htree_groups_max;
}
@@ -453,63 +443,71 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
goto Error;
}
- next = huffman_tables;
+ huffman_table = huffman_tables;
for (i = 0; i < num_htree_groups_max; ++i) {
- // If the index "i" is unused in the Huffman image, read the coefficients
- // but store them to a bogus htree_group.
- const int is_bogus = (mapping != NULL && mapping[i] == -1);
- HTreeGroup* const htree_group =
- is_bogus ? &htree_group_bogus :
- &htree_groups[(mapping == NULL) ? i : mapping[i]];
- HuffmanCode** const htrees = htree_group->htrees;
- HuffmanCode* huffman_tables_i = is_bogus ? huffman_tables_bogus : next;
- int size;
- int total_size = 0;
- int is_trivial_literal = 1;
- int max_bits = 0;
- for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
- int alphabet_size = kAlphabetSize[j];
- htrees[j] = huffman_tables_i;
- if (j == 0 && color_cache_bits > 0) {
- alphabet_size += 1 << color_cache_bits;
- }
- size =
- ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables_i);
- if (size == 0) {
- goto Error;
- }
- if (is_trivial_literal && kLiteralMap[j] == 1) {
- is_trivial_literal = (huffman_tables_i->bits == 0);
+ // If the index "i" is unused in the Huffman image, just make sure the
+ // coefficients are valid but do not store them.
+ if (mapping != NULL && mapping[i] == -1) {
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ int alphabet_size = kAlphabetSize[j];
+ if (j == 0 && color_cache_bits > 0) {
+ alphabet_size += (1 << color_cache_bits);
+ }
+ // Passing in NULL so that nothing gets filled.
+ if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, NULL)) {
+ goto Error;
+ }
}
- total_size += huffman_tables_i->bits;
- huffman_tables_i += size;
- if (j <= ALPHA) {
- int local_max_bits = code_lengths[0];
- int k;
- for (k = 1; k < alphabet_size; ++k) {
- if (code_lengths[k] > local_max_bits) {
- local_max_bits = code_lengths[k];
+ } else {
+ HTreeGroup* const htree_group =
+ &htree_groups[(mapping == NULL) ? i : mapping[i]];
+ HuffmanCode** const htrees = htree_group->htrees;
+ int size;
+ int total_size = 0;
+ int is_trivial_literal = 1;
+ int max_bits = 0;
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ int alphabet_size = kAlphabetSize[j];
+ htrees[j] = huffman_table;
+ if (j == 0 && color_cache_bits > 0) {
+ alphabet_size += (1 << color_cache_bits);
+ }
+ size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+ if (size == 0) {
+ goto Error;
+ }
+ if (is_trivial_literal && kLiteralMap[j] == 1) {
+ is_trivial_literal = (huffman_table->bits == 0);
+ }
+ total_size += huffman_table->bits;
+ huffman_table += size;
+ if (j <= ALPHA) {
+ int local_max_bits = code_lengths[0];
+ int k;
+ for (k = 1; k < alphabet_size; ++k) {
+ if (code_lengths[k] > local_max_bits) {
+ local_max_bits = code_lengths[k];
+ }
}
+ max_bits += local_max_bits;
}
- max_bits += local_max_bits;
}
- }
- if (!is_bogus) next = huffman_tables_i;
- htree_group->is_trivial_literal = is_trivial_literal;
- htree_group->is_trivial_code = 0;
- if (is_trivial_literal) {
- const int red = htrees[RED][0].value;
- const int blue = htrees[BLUE][0].value;
- const int alpha = htrees[ALPHA][0].value;
- htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
- if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
- htree_group->is_trivial_code = 1;
- htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+ htree_group->is_trivial_literal = is_trivial_literal;
+ htree_group->is_trivial_code = 0;
+ if (is_trivial_literal) {
+ const int red = htrees[RED][0].value;
+ const int blue = htrees[BLUE][0].value;
+ const int alpha = htrees[ALPHA][0].value;
+ htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
+ if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
+ htree_group->is_trivial_code = 1;
+ htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+ }
}
+ htree_group->use_packed_table =
+ !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
+ if (htree_group->use_packed_table) BuildPackedTable(htree_group);
}
- htree_group->use_packed_table =
- !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
- if (htree_group->use_packed_table) BuildPackedTable(htree_group);
}
ok = 1;
@@ -521,7 +519,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
Error:
WebPSafeFree(code_lengths);
- WebPSafeFree(huffman_tables_bogus);
WebPSafeFree(mapping);
if (!ok) {
WebPSafeFree(huffman_image);
@@ -562,8 +559,11 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
memory += work_size * sizeof(*work);
scaled_data = (uint32_t*)memory;
- WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
- out_width, out_height, 0, num_channels, work);
+ if (!WebPRescalerInit(dec->rescaler, in_width, in_height,
+ (uint8_t*)scaled_data, out_width, out_height,
+ 0, num_channels, work)) {
+ return 0;
+ }
return 1;
}
#endif // WEBP_REDUCE_SIZE
@@ -577,13 +577,14 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
int rgba_stride, uint8_t* const rgba) {
uint32_t* const src = (uint32_t*)rescaler->dst;
+ uint8_t* dst = rgba;
const int dst_width = rescaler->dst_width;
int num_lines_out = 0;
while (WebPRescalerHasPendingOutput(rescaler)) {
- uint8_t* const dst = rgba + num_lines_out * rgba_stride;
WebPRescalerExportRow(rescaler);
WebPMultARGBRow(src, dst_width, 1);
VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+ dst += rgba_stride;
++num_lines_out;
}
return num_lines_out;
@@ -597,8 +598,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
int num_lines_in = 0;
int num_lines_out = 0;
while (num_lines_in < mb_h) {
- uint8_t* const row_in = in + num_lines_in * in_stride;
- uint8_t* const row_out = out + num_lines_out * out_stride;
+ uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
+ uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
const int lines_left = mb_h - num_lines_in;
const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
int lines_imported;
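
The (uint64_t) casts above exist because `num_lines_in * in_stride` is an int-by-int product: for tall images with wide strides it can overflow before being added to the pointer. Promoting one operand first keeps the whole computation in 64 bits. A before/after sketch:

    #include <stdint.h>

    static uint8_t* row_ptr(uint8_t* base, int line, int stride) {
      /* base + line * stride would multiply in 32 bits and may wrap;  */
      /* casting one operand forces a 64-bit product before the add.   */
      return base + (uint64_t)line * stride;
    }
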
@@ -757,11 +758,11 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
-static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+static void ApplyInverseTransforms(VP8LDecoder* const dec,
+ int start_row, int num_rows,
const uint32_t* const rows) {
int n = dec->next_transform_;
const int cache_pixs = dec->width_ * num_rows;
- const int start_row = dec->last_row_;
const int end_row = start_row + num_rows;
const uint32_t* rows_in = rows;
uint32_t* const rows_out = dec->argb_cache_;
@@ -792,15 +793,15 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
VP8Io* const io = dec->io_;
uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
const int in_stride = io->width * sizeof(uint32_t); // in unit of RGBA
-
- ApplyInverseTransforms(dec, num_rows, rows);
+ ApplyInverseTransforms(dec, dec->last_row_, num_rows, rows);
if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
// Nothing to output (this time).
} else {
const WebPDecBuffer* const output = dec->output_;
if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA
const WebPRGBABuffer* const buf = &output->u.RGBA;
- uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+ uint8_t* const rgba =
+ buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
const int num_rows_out =
#if !defined(WEBP_REDUCE_SIZE)
io->use_scaling ?
@@ -951,7 +952,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
break;
default:
goto Copy;
- break;
}
CopySmallPattern8b(src, dst, length, pattern);
return;
@@ -1196,6 +1196,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
dist_code = GetCopyDistance(dist_symbol, br);
dist = PlaneCodeToDistance(width, dist_code);
+
if (VP8LIsEndOfStream(br)) break;
if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
goto Error;
@@ -1518,7 +1519,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
assert(dec->width_ <= final_width);
dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
if (dec->pixels_ == NULL) {
- dec->argb_cache_ = NULL; // for sanity check
+ dec->argb_cache_ = NULL; // for soundness
dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
return 0;
}
@@ -1528,7 +1529,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
- dec->argb_cache_ = NULL; // for sanity check
+ dec->argb_cache_ = NULL; // for soundness
dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
if (dec->pixels_ == NULL) {
dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
@@ -1556,7 +1557,7 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
const int cache_pixs = width * num_rows_to_process;
uint8_t* const dst = output + width * cur_row;
const uint32_t* const src = dec->argb_cache_;
- ApplyInverseTransforms(dec, num_rows_to_process, in);
+ ApplyInverseTransforms(dec, cur_row, num_rows_to_process, in);
WebPExtractGreen(src, dst, cache_pixs);
AlphaApplyFilter(alph_dec,
cur_row, cur_row + num_rows_to_process, dst, width);
@@ -1670,7 +1671,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
VP8Io* io = NULL;
WebPDecParams* params = NULL;
- // Sanity checks.
if (dec == NULL) return 0;
assert(dec->hdr_.huffman_tables_ != NULL);
diff --git a/media/libwebp/dec/vp8li_dec.h b/media/libwebp/dec/vp8li_dec.h
index 2b9c95a44b..8df713beb8 100644
--- a/media/libwebp/dec/vp8li_dec.h
+++ b/media/libwebp/dec/vp8li_dec.h
@@ -37,7 +37,7 @@ struct VP8LTransform {
int bits_; // subsampling bits defining transform window.
int xsize_; // transform window X index.
int ysize_; // transform window Y index.
- uint32_t *data_; // transform data.
+ uint32_t* data_; // transform data.
};
typedef struct {
@@ -48,23 +48,23 @@ typedef struct {
int huffman_mask_;
int huffman_subsample_bits_;
int huffman_xsize_;
- uint32_t *huffman_image_;
+ uint32_t* huffman_image_;
int num_htree_groups_;
- HTreeGroup *htree_groups_;
- HuffmanCode *huffman_tables_;
+ HTreeGroup* htree_groups_;
+ HuffmanCode* huffman_tables_;
} VP8LMetadata;
typedef struct VP8LDecoder VP8LDecoder;
struct VP8LDecoder {
VP8StatusCode status_;
VP8LDecodeState state_;
- VP8Io *io_;
+ VP8Io* io_;
- const WebPDecBuffer *output_; // shortcut to io->opaque->output
+ const WebPDecBuffer* output_; // shortcut to io->opaque->output
- uint32_t *pixels_; // Internal data: either uint8_t* for alpha
+ uint32_t* pixels_; // Internal data: either uint8_t* for alpha
// or uint32_t* for BGRA.
- uint32_t *argb_cache_; // Scratch buffer for temporary BGRA storage.
+ uint32_t* argb_cache_; // Scratch buffer for temporary BGRA storage.
VP8LBitReader br_;
int incremental_; // if true, incremental decoding is expected
@@ -86,8 +86,8 @@ struct VP8LDecoder {
// or'd bitset storing the transforms types.
uint32_t transforms_seen_;
- uint8_t *rescaler_memory; // Working memory for rescaling work.
- WebPRescaler *rescaler; // Common rescaler for all channels.
+ uint8_t* rescaler_memory; // Working memory for rescaling work.
+ WebPRescaler* rescaler; // Common rescaler for all channels.
};
//------------------------------------------------------------------------------
diff --git a/media/libwebp/dec/webp_dec.c b/media/libwebp/dec/webp_dec.c
index 89c264d0a0..6857960774 100644
--- a/media/libwebp/dec/webp_dec.c
+++ b/media/libwebp/dec/webp_dec.c
@@ -785,6 +785,13 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
//------------------------------------------------------------------------------
// Cropping and rescaling.
+int WebPCheckCropDimensions(int image_width, int image_height,
+ int x, int y, int w, int h) {
+ return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
+ x >= image_width || w > image_width || w > image_width - x ||
+ y >= image_height || h > image_height || h > image_height - y);
+}
+
int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
const int W = io->width;
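
WebPCheckCropDimensions() is written with `w > image_width - x` rather than the old `x + w > W` on purpose: once x is known to be non-negative and less than image_width, the subtraction cannot wrap, whereas the addition could overflow int for hostile values. A standalone sketch of the same shape:

    static int crop_ok(int W, int H, int x, int y, int w, int h) {
      if (x < 0 || y < 0 || w <= 0 || h <= 0) return 0;
      if (x >= W || w > W || w > W - x) return 0;  /* no x + w overflow */
      if (y >= H || h > H || h > H - y) return 0;
      return 1;
    }
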
@@ -792,7 +799,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
int x = 0, y = 0, w = W, h = H;
// Cropping
- io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+ io->use_cropping = (options != NULL) && options->use_cropping;
if (io->use_cropping) {
w = options->crop_width;
h = options->crop_height;
@@ -802,7 +809,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
x &= ~1;
y &= ~1;
}
- if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+ if (!WebPCheckCropDimensions(W, H, x, y, w, h)) {
return 0; // out of frame boundary error
}
}
@@ -814,7 +821,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
io->mb_h = h;
// Scaling
- io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+ io->use_scaling = (options != NULL) && options->use_scaling;
if (io->use_scaling) {
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
@@ -835,8 +842,8 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
if (io->use_scaling) {
// disable filter (only for large downscaling ratio).
- io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
- (io->scaled_height < H * 3 / 4);
+ io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+ (io->scaled_height < H * 3 / 4);
io->fancy_upsampling = 0;
}
return 1;
diff --git a/media/libwebp/dec/webpi_dec.h b/media/libwebp/dec/webpi_dec.h
index 83d7444e51..a1b7c83fcd 100644
--- a/media/libwebp/dec/webpi_dec.h
+++ b/media/libwebp/dec/webpi_dec.h
@@ -77,6 +77,10 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
//------------------------------------------------------------------------------
// Misc utils
+// Returns true if crop dimensions are within image bounds.
+int WebPCheckCropDimensions(int image_width, int image_height,
+ int x, int y, int w, int h);
+
// Initializes VP8Io with custom setup, io and teardown functions. The default
// hooks will use the supplied 'params' as io->opaque handle.
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
diff --git a/media/libwebp/demux/demux.c b/media/libwebp/demux/demux.c
index 2034024d06..13953b1c54 100644
--- a/media/libwebp/demux/demux.c
+++ b/media/libwebp/demux/demux.c
@@ -24,7 +24,7 @@
#include "../webp/format_constants.h"
#define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 0
+#define DMUX_MIN_VERSION 2
#define DMUX_REV_VERSION 2
typedef struct {
@@ -221,12 +221,16 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t payload_size = ReadLE32(mem);
- const uint32_t payload_size_padded = payload_size + (payload_size & 1);
- const size_t payload_available = (payload_size_padded > MemDataSize(mem))
- ? MemDataSize(mem) : payload_size_padded;
- const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+ uint32_t payload_size_padded;
+ size_t payload_available;
+ size_t chunk_size;
if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+ payload_size_padded = payload_size + (payload_size & 1);
+ payload_available = (payload_size_padded > MemDataSize(mem))
+ ? MemDataSize(mem) : payload_size_padded;
+ chunk_size = CHUNK_HEADER_SIZE + payload_available;
if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
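
The reordering in StoreFrame() matters because `payload_size + (payload_size & 1)` wraps to 0 when payload_size is 0xffffffff, so the padded size must only be computed after the MAX_CHUNK_PAYLOAD rejection. A minimal sketch of the safe ordering (MAX_PAYLOAD is illustrative, not libwebp's actual constant):

    #include <stdint.h>

    #define MAX_PAYLOAD (1u << 30)  /* illustrative bound */

    static int padded_chunk_size(uint32_t payload_size, uint32_t* padded) {
      if (payload_size > MAX_PAYLOAD) return 0;     /* reject first...    */
      *padded = payload_size + (payload_size & 1);  /* ...then pad safely */
      return 1;
    }
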
@@ -312,6 +316,7 @@ static ParseStatus ParseAnimationFrame(
int bits;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
+ size_t start_offset;
ParseStatus status =
NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
if (status != PARSE_OK) return status;
@@ -332,7 +337,11 @@ static ParseStatus ParseAnimationFrame(
// Store a frame only if the animation flag is set and some data for this
// frame is available.
+ start_offset = mem->start_;
status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+ if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
+ status = PARSE_ERROR;
+ }
if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
added_frame = AddFrame(dmux, frame);
if (added_frame) {
@@ -446,9 +455,11 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t chunk_size = ReadLE32(mem);
- const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+ uint32_t chunk_size_padded;
if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+ chunk_size_padded = chunk_size + (chunk_size & 1);
if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
switch (fourcc) {
diff --git a/media/libwebp/dsp/alpha_processing.c b/media/libwebp/dsp/alpha_processing.c
index 6ff1352ae2..8c5e90210f 100644
--- a/media/libwebp/dsp/alpha_processing.c
+++ b/media/libwebp/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
}
}
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
}
}
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
}
#if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int height,
- uint8_t* dst, int dst_stride) {
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xff);
}
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
return 0;
}
+static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
+ int x;
+ for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
+}
+
//------------------------------------------------------------------------------
// Simple channel manipulations.
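
The new WebPAlphaReplace hook rewrites every pixel whose alpha byte (bits 24..31) is zero with a caller-supplied color. A hypothetical driver over a 4-pixel buffer, using the same scalar logic as AlphaReplace_C above:

    #include <stdint.h>
    #include <stdio.h>

    static void alpha_replace(uint32_t* src, int length, uint32_t color) {
      int x;
      for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
    }

    int main(void) {
      uint32_t px[4] = { 0x00123456u, 0xff00ff00u, 0x00000000u, 0x80aabbccu };
      alpha_replace(px, 4, 0xffffffffu);  /* transparent -> opaque white */
      /* prints: ffffffff ff00ff00 ffffffff 80aabbcc */
      printf("%08x %08x %08x %08x\n", (unsigned)px[0], (unsigned)px[1],
             (unsigned)px[2], (unsigned)px[3]);
      return 0;
    }
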
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
}
#ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
}
#endif
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
//------------------------------------------------------------------------------
// Init function
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
+ WebPAlphaReplace = AlphaReplace_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
+ assert(WebPAlphaReplace != NULL);
}
diff --git a/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
new file mode 100644
index 0000000000..ab597e68bb
--- /dev/null
+++ b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
@@ -0,0 +1,228 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ uint32_t alpha_mask = 0xffffffff;
+ int i, j, temp0;
+
+ for (j = 0; j < height; ++j) {
+ uint8_t* pdst = dst;
+ const uint8_t* palpha = alpha;
+ for (i = 0; i < (width >> 2); ++i) {
+ int temp1, temp2, temp3;
+
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 4 \n\t"
+ "addiu %[pdst], %[pdst], 16 \n\t"
+ "srl %[temp1], %[temp0], 8 \n\t"
+ "srl %[temp2], %[temp0], 16 \n\t"
+ "srl %[temp3], %[temp0], 24 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "sb %[temp0], -16(%[pdst]) \n\t"
+ "sb %[temp1], -12(%[pdst]) \n\t"
+ "sb %[temp2], -8(%[pdst]) \n\t"
+ "sb %[temp3], -4(%[pdst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+
+ for (i = 0; i < (width & 3); ++i) {
+ __asm__ volatile (
+ "lbu %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 1 \n\t"
+ "sb %[temp0], 0(%[pdst]) \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "addiu %[pdst], %[pdst], 4 \n\t"
+ : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+
+ __asm__ volatile (
+ "ext %[temp0], %[alpha_mask], 0, 16 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 16 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "ext %[temp0], %[alpha_mask], 0, 8 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 8 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+ :
+ );
+
+ return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+ int inverse) {
+ int x;
+ const uint32_t c_00ffffff = 0x00ffffffu;
+ const uint32_t c_ff000000 = 0xff000000u;
+ const uint32_t c_8000000 = 0x00800000u;
+ const uint32_t c_8000080 = 0x00800080u;
+ for (x = 0; x < width; ++x) {
+ const uint32_t argb = ptr[x];
+ if (argb < 0xff000000u) { // alpha < 255
+ if (argb <= 0x00ffffffu) { // alpha == 0
+ ptr[x] = 0;
+ } else {
+ int temp0, temp1, temp2, temp3, alpha;
+ __asm__ volatile (
+ "srl %[alpha], %[argb], 24 \n\t"
+ "replv.qb %[temp0], %[alpha] \n\t"
+ "and %[temp0], %[temp0], %[c_00ffffff] \n\t"
+ "beqz %[inverse], 0f \n\t"
+ "divu $zero, %[c_ff000000], %[alpha] \n\t"
+ "mflo %[temp0] \n\t"
+ "0: \n\t"
+ "andi %[temp1], %[argb], 0xff \n\t"
+ "ext %[temp2], %[argb], 8, 8 \n\t"
+ "ext %[temp3], %[argb], 16, 8 \n\t"
+ "mul %[temp1], %[temp1], %[temp0] \n\t"
+ "mul %[temp2], %[temp2], %[temp0] \n\t"
+ "mul %[temp3], %[temp3], %[temp0] \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp1] \n\t"
+ "addu %[temp3], %[temp3], %[c_8000000] \n\t"
+ "addu %[temp1], %[temp1], %[c_8000080] \n\t"
+ "precrq.ph.w %[temp3], %[argb], %[temp3] \n\t"
+ "precrq.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+ : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+ [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+ [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+ : "memory", "hi", "lo"
+ );
+ ptr[x] = temp1;
+ }
+ }
+ }
+}
+
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
+ const uint8_t* g, const uint8_t* b, int len,
+ uint32_t* out) {
+ int temp0, temp1, temp2, temp3, offset;
+ const int rest = len & 1;
+ const uint32_t* const loop_end = out + len - rest;
+ const int step = 4;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+#endif // WORDS_BIGENDIAN
+
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, int step,
+ uint32_t* out) {
+ int temp0, temp1, temp2, offset;
+ const int rest = len & 1;
+ const int a = 0xff;
+ const uint32_t* const loop_end = out + len - rest;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+ WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+ WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+#ifdef WORDS_BIGENDIAN
+ WebPPackARGB = PackARGB_MIPSdspR2;
+#endif
+ WebPPackRGB = PackRGB_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/alpha_processing_neon.c b/media/libwebp/dsp/alpha_processing_neon.c
index 53dfce2b36..c900279a35 100644
--- a/media/libwebp/dsp/alpha_processing_neon.c
+++ b/media/libwebp/dsp/alpha_processing_neon.c
@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
//------------------------------------------------------------------------------
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xffffffffu);
}
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
uint8x8x4_t greens; // leave A/R/B channels zero'd.
greens.val[0] = vdup_n_u8(0);
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xffffffffu);
}
-static void ExtractGreen_NEON(const uint32_t* argb,
- uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i + 16 <= size; i += 16) {
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/media/libwebp/dsp/alpha_processing_sse2.c b/media/libwebp/dsp/alpha_processing_sse2.c
index 9a3bc4485a..56d9ee5e98 100644
--- a/media/libwebp/dsp/alpha_processing_sse2.c
+++ b/media/libwebp/dsp/alpha_processing_sse2.c
@@ -18,9 +18,9 @@
//------------------------------------------------------------------------------
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -214,7 +215,7 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
// Alpha detection
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
- const __m128i all_0xff = _mm_set1_epi8(0xff);
+ const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
for (; i + 16 <= length; i += 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
@@ -228,7 +229,7 @@ static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
const __m128i alpha_mask = _mm_set1_epi32(0xff);
- const __m128i all_0xff = _mm_set1_epi8(0xff);
+ const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
// We don't know if we can access the last 3 bytes after the last alpha
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
return 0;
}
+static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
+ const __m128i m_color = _mm_set1_epi32(color);
+ const __m128i zero = _mm_setzero_si128();
+ int i = 0;
+ for (; i + 8 <= length; i += 8) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
+ const __m128i b0 = _mm_srai_epi32(a0, 24);
+ const __m128i b1 = _mm_srai_epi32(a1, 24);
+ const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
+ const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
+ const __m128i d0 = _mm_and_si128(c0, m_color);
+ const __m128i d1 = _mm_and_si128(c1, m_color);
+ const __m128i e0 = _mm_andnot_si128(c0, a0);
+ const __m128i e1 = _mm_andnot_si128(c1, a1);
+ _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
+ _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
+ }
+ for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
+}
+
// -----------------------------------------------------------------------------
// Apply alpha value to rows
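
AlphaReplace_SSE2 above is built on the classic branchless select, result = (mask & replacement) | (andnot(mask, original)), expressed with _mm_and_si128/_mm_andnot_si128/_mm_or_si128. The same idiom in scalar form:

    #include <stdint.h>

    /* mask is all-ones to pick a, all-zeros to pick b (per 32-bit lane). */
    static uint32_t select32(uint32_t mask, uint32_t a, uint32_t b) {
      return (mask & a) | (~mask & b);
    }
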
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
+ WebPAlphaReplace = AlphaReplace_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/alpha_processing_sse41.c b/media/libwebp/dsp/alpha_processing_sse41.c
index e33c1aba4d..307d200f1f 100644
--- a/media/libwebp/dsp/alpha_processing_sse41.c
+++ b/media/libwebp/dsp/alpha_processing_sse41.c
@@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
diff --git a/media/libwebp/dsp/cost.c b/media/libwebp/dsp/cost.c
new file mode 100644
index 0000000000..bf112c7f0c
--- /dev/null
+++ b/media/libwebp/dsp/cost.c
@@ -0,0 +1,411 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+ 1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+ 1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
+ 939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
+ 786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
+ 683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
+ 598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
+ 534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
+ 477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
+ 427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
+ 384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
+ 347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
+ 312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
+ 280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
+ 251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
+ 223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
+ 198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
+ 175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
+ 152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
+ 131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
+ 111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
+ 92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
+ 74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
+ 57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
+ 41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
+ 25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
+ 10, 9, 7, 6, 4, 3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// fixed costs for coding levels, deduced from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+ 0, 256, 256, 256, 256, 432, 618, 630,
+ 731, 640, 640, 828, 901, 948, 1021, 1101,
+ 1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+ 1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+ 1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+ 1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+ 1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+ 1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+ 1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+ 1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+ 1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+ 1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+ 1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+ 2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+ 2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+ 2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+ 2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+ 2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+ 2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+ 2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+ 2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+ 2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+ 2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+ 2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+ 2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+ 2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+ 2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+ 2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+ 2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+ 2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+ 2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+ 3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+ 3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+ 3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+ 3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+ 3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+ 3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+ 3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+ 3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+ 3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+ 3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+ 2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+ 2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+ 3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+ 3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+ 3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+ 3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+ 3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+ 3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+ 3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+ 3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+ 3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+ 3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+ 3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+ 3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+ 3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+ 3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+ 3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+ 3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+ 3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+ 3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+ 3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+ 4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+ 4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+ 4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+ 4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+ 4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+ 4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+ 4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+ 4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+ 4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+ 4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+ 4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+ 5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+ 5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+ 5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+ 5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+ 5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+ 5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+ 5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+ 5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+ 5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+ 5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+ 5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+ 6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+ 6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+ 6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+ 6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+ 6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+ 6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+ 6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+ 6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+ 6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+ 6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+ 6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+ 6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+ 6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+ 6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+ 6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+ 6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+ 6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+ 6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+ 6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+ 7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+ 7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+ 6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+ 6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+ 6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+ 6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+ 6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+ 6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+ 6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+ 6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+ 6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+ 6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+ 7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+ 7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+ 7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+ 7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+ 7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+ 7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+ 7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+ 7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+ 7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+ 7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+ 7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+ 7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+ 7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 // sentinel
+};
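+// The bands group the 16 coefficient positions (zigzag scan order) for
+// probability/cost table selection; the trailing 0 is a sentinel so that
+// VP8EncBands[n + 1] lookups stay in bounds for n == 15.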
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
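+ // Walk the coefficients, chaining contexts: the magnitude of coefficient n
+ // (clamped to 2) selects the cost table used for coefficient n + 1. E.g.
+ // (illustrative) coeffs = {5, -1}: n = 0 is costed with t and yields
+ // ctx = 2, so the last coefficient is costed with costs[1][2].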
+ for (; n < res->last; ++n) {
+ const int v = abs(res->coeffs[n]);
+ const int ctx = (v >= 2) ? 2 : v;
+ cost += VP8LevelCost(t, v);
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ int n;
+ res->last = -1;
+ assert(res->first == 0 || coeffs[0] == 0);
+ for (n = 15; n >= 0; --n) {
+ if (coeffs[n]) {
+ res->last = n;
+ break;
+ }
+ }
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
+ VP8GetResidualCost = GetResidualCost_C;
+ VP8SetResidualCoeffs = SetResidualCoeffs_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspCostInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspCostInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspCostInitSSE2();
+ }
+#endif
+#if defined(WEBP_HAVE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8EncDspCostInitNEON();
+ }
+#endif
+ }
+}
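+// (WEBP_DSP_INIT_FUNC wraps the body above so the setup is expected to run
+// effectively once before VP8GetResidualCost/VP8SetResidualCoeffs are used.)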
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/dsp/cost_mips32.c b/media/libwebp/dsp/cost_mips32.c
new file mode 100644
index 0000000000..4e97e8a756
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips32.c
@@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
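+ // The assembly below advances p_costs by one row (NUM_CTX pointers) per
+ // coefficient, then loads t = costs[n + 1][ctx] from that row, mirroring
+ // the table chaining of the C version.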
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "blez %[temp1], 2f \n\t"
+ " addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
+ "1: \n\t"
+ "lh %[v_reg], 0(%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "negu %[temp0], %[v_reg] \n\t"
+ "slti %[temp1], %[v_reg], 0 \n\t"
+ "movn %[v_reg], %[temp0], %[temp1] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
+ "lhu %[temp1], 0(%[temp1]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "addu %[v_reg], %[v_reg], %[t] \n\t"
+ "lhu %[temp0], 0(%[v_reg]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const int16_t* p_coeffs = coeffs;
+ int temp0, temp1, temp2, n, n1;
+ assert(res->first == 0 || coeffs[0] == 0);
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
+ "li %[n], 15 \n\t"
+ "li %[temp2], -1 \n\t"
+ "0: \n\t"
+ "ulw %[temp0], 0(%[p_coeffs]) \n\t"
+ "beqz %[temp0], 1f \n\t"
+#if defined(WORDS_BIGENDIAN)
+ " sll %[temp1], %[temp0], 16 \n\t"
+#else
+ " srl %[temp1], %[temp0], 16 \n\t"
+#endif
+ "addiu %[n1], %[n], -1 \n\t"
+ "movz %[temp0], %[n1], %[temp1] \n\t"
+ "movn %[temp0], %[n], %[temp1] \n\t"
+ "j 2f \n\t"
+ " addiu %[temp2], %[temp0], 0 \n\t"
+ "1: \n\t"
+ "addiu %[n], %[n], -2 \n\t"
+ "bgtz %[n], 0b \n\t"
+ " addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [n]"=&r"(n), [n1]"=&r"(n1)
+ :
+ : "memory"
+ );
+ res->last = temp2;
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+ VP8GetResidualCost = GetResidualCost_MIPS32;
+ VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/cost_mips_dsp_r2.c b/media/libwebp/dsp/cost_mips_dsp_r2.c
new file mode 100644
index 0000000000..e9ee99f6ac
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips_dsp_r2.c
@@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "blez %[temp1], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "absq_s.w %[v_reg], %[v_reg] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "lhx %[temp0], %[v_reg](%[t]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+ VP8GetResidualCost = GetResidualCost_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/cost_neon.c b/media/libwebp/dsp/cost_neon.c
new file mode 100644
index 0000000000..78f715ff27
--- /dev/null
+++ b/media/libwebp/dsp/cost_neon.c
@@ -0,0 +1,122 @@
+// Copyright 2018 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of cost functions
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "../dsp/neon.h"
+#include "../enc/cost_enc.h"
+
+static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16 };
+
+static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const int16x8_t minus_one = vdupq_n_s16(-1);
+ const int16x8_t coeffs_0 = vld1q_s16(coeffs);
+ const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
+ const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one);
+ const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one);
+ const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
+ const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
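+ // Each byte of 'masked' now holds (index + 1) where the coefficient is
+ // non-zero and 0 elsewhere, so the horizontal maximum below is last + 1
+ // (0 when all coefficients are zero, giving res->last == -1).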
+
+#ifdef __aarch64__
+ res->last = vmaxvq_u8(masked) - 1;
+#else
+ const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
+ const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8);
+ const uint16x4_t eob_16x4 =
+ vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8));
+ const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4);
+ uint32x2_t eob_32x2 =
+ vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4));
+ eob_32x2 = vpmax_u32(eob_32x2, eob_32x2);
+
+ vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
+ --res->last;
+#endif // __aarch64__
+
+ res->coeffs = coeffs;
+}
+
+static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
+ uint8_t levels[16], ctxs[16];
+ uint16_t abs_levels[16];
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ { // precompute clamped levels and contexts, packed to 8b.
+ const uint8x16_t kCst2 = vdupq_n_u8(2);
+ const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL);
+ const int16x8_t c0 = vld1q_s16(res->coeffs);
+ const int16x8_t c1 = vld1q_s16(res->coeffs + 8);
+ const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0));
+ const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1));
+ const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1));
+ const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2
+ const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67]
+
+ vst1q_u8(ctxs, G);
+ vst1q_u8(levels, H);
+
+ vst1q_u16(abs_levels, E0);
+ vst1q_u16(abs_levels + 8, E1);
+ }
+ for (; n < res->last; ++n) {
+ const int ctx = ctxs[n];
+ const int level = levels[n];
+ const int flevel = abs_levels[n]; // full level
+ cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int level = levels[n];
+ const int flevel = abs_levels[n];
+ assert(flevel != 0);
+ cost += VP8LevelFixedCosts[flevel] + t[level];
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = ctxs[n];
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
+ VP8GetResidualCost = GetResidualCost_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/cost_sse2.c b/media/libwebp/dsp/cost_sse2.c
new file mode 100644
index 0000000000..8cfe4e0091
--- /dev/null
+++ b/media/libwebp/dsp/cost_sse2.c
@@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+ // Use SSE2 to compare 16 values with a single instruction.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i m0 = _mm_packs_epi16(c0, c1);
+ const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
+ // Get the comparison results as a bitmask into the low 16 bits, then
+ // complement it so that the set bits mark the entries that are not equal
+ // to zero. We don't need to mask out the least significant bits according
+ // to res->first, since coeffs[0] is 0 if res->first > 0.
+ const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
+ // The position of the most significant non-zero bit indicates the position of
+ // the last non-zero value.
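+ // E.g. (illustrative) coeffs = {7, 0, 0, -2, 0, ...}: cmpeq-with-zero sets
+ // 0xFF in the zero lanes, movemask yields 0xfff6, and the XOR leaves
+ // mask == 0x0009, so BitsLog2Floor(mask) == 3, the last non-zero index.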
+ assert(res->first == 0 || coeffs[0] == 0);
+ res->last = mask ? BitsLog2Floor(mask) : -1;
+ res->coeffs = coeffs;
+}
+
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
+ uint8_t levels[16], ctxs[16];
+ uint16_t abs_levels[16];
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ { // precompute clamped levels and contexts, packed to 8b.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i kCst2 = _mm_set1_epi8(2);
+ const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+ const __m128i D0 = _mm_sub_epi16(zero, c0);
+ const __m128i D1 = _mm_sub_epi16(zero, c1);
+ const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
+ const __m128i E1 = _mm_max_epi16(c1, D1);
+ const __m128i F = _mm_packs_epi16(E0, E1);
+ const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
+ const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
+
+ _mm_storeu_si128((__m128i*)&ctxs[0], G);
+ _mm_storeu_si128((__m128i*)&levels[0], H);
+
+ _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
+ _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
+ }
+ for (; n < res->last; ++n) {
+ const int ctx = ctxs[n];
+ const int level = levels[n];
+ const int flevel = abs_levels[n]; // full level
+ cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int level = levels[n];
+ const int flevel = abs_levels[n];
+ assert(flevel != 0);
+ cost += VP8LevelFixedCosts[flevel] + t[level];
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = ctxs[n];
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+ VP8GetResidualCost = GetResidualCost_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/cpu.c b/media/libwebp/dsp/cpu.c
new file mode 100644
index 0000000000..ff57d90224
--- /dev/null
+++ b/media/libwebp/dsp/cpu.c
@@ -0,0 +1,253 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#if defined(WEBP_ANDROID_NEON)
+#include <cpu-features.h>
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "mov %%ebx, %%edi\n"
+ "cpuid\n"
+ "xchg %%edi, %%ebx\n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(__x86_64__) && \
+ (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+ defined(__PIC__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "xchg{q}\t{%%rbx}, %q1\n"
+ "cpuid\n"
+ "xchg{q}\t{%%rbx}, %q1\n"
+ : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "cpuid\n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
+#define WEBP_HAVE_MSC_CPUID
+#elif _MSC_VER > 1310
+#include <intrin.h>
+#define GetCPUInfo __cpuid
+#define WEBP_HAVE_MSC_CPUID
+#endif
+
+#endif
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
+
+// helper function for run-time detection of slow SSSE3 platforms
+static int CheckSlowModel(int info) {
+ // Table listing CPU models with longer latencies for the bsr instruction
+ // (10/16 cycles instead of the usual 2) and for some SSSE3 instructions
+ // like pshufb.
+ // Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
+ static const uint8_t kSlowModels[] = {
+ 0x37, 0x4a, 0x4d, // Silvermont Microarchitecture
+ 0x1c, 0x26, 0x27 // Atom Microarchitecture
+ };
+ const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
+ const uint32_t family = (info >> 8) & 0xf;
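+ // E.g. (illustrative) info == 0x00030673: extended model 0x3 and base
+ // model 0x7 combine into model 0x37 (Silvermont), family 0x6.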
+ if (family == 0x06) {
+ size_t i;
+ for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
+ if (model == kSlowModels[i]) return 1;
+ }
+ }
+ return 0;
+}
+
+static int x86CPUInfo(CPUFeature feature) {
+ int max_cpuid_value;
+ int cpu_info[4];
+ int is_intel = 0;
+
+ // get the highest feature value cpuid supports
+ GetCPUInfo(cpu_info, 0);
+ max_cpuid_value = cpu_info[0];
+ if (max_cpuid_value < 1) {
+ return 0;
+ } else {
+ const int VENDOR_ID_INTEL_EBX = 0x756e6547; // uneG
+ const int VENDOR_ID_INTEL_EDX = 0x49656e69; // Ieni
+ const int VENDOR_ID_INTEL_ECX = 0x6c65746e; // letn
+ is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
+ cpu_info[2] == VENDOR_ID_INTEL_ECX &&
+ cpu_info[3] == VENDOR_ID_INTEL_EDX); // genuine Intel?
+ }
+
+ GetCPUInfo(cpu_info, 1);
+ if (feature == kSSE2) {
+ return !!(cpu_info[3] & (1 << 26));
+ }
+ if (feature == kSSE3) {
+ return !!(cpu_info[2] & (1 << 0));
+ }
+ if (feature == kSlowSSSE3) {
+ if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
+ return CheckSlowModel(cpu_info[0]);
+ }
+ return 0;
+ }
+
+ if (feature == kSSE4_1) {
+ return !!(cpu_info[2] & (1 << 19));
+ }
+ if (feature == kAVX) {
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+ // XMM state and YMM state enabled by the OS.
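+ // (XCR0 bit 1 = XMM state, bit 2 = YMM state, hence the 0x6 mask.)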
+ return (xgetbv() & 0x6) == 0x6;
+ }
+ }
+ if (feature == kAVX2) {
+ if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
+ GetCPUInfo(cpu_info, 7);
+ return !!(cpu_info[1] & (1 << 5));
+ }
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test.
+static int AndroidCPUInfo(CPUFeature feature) {
+ const AndroidCpuFamily cpu_family = android_getCpuFamily();
+ const uint64_t cpu_features = android_getCpuFeatures();
+ if (feature == kNEON) {
+ return cpu_family == ANDROID_CPU_FAMILY_ARM &&
+ (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
+// Use compile flags as an indicator of SIMD support instead of a runtime check.
+static int wasmCPUInfo(CPUFeature feature) {
+ switch (feature) {
+#ifdef WEBP_HAVE_SSE2
+ case kSSE2:
+ return 1;
+#endif
+#ifdef WEBP_HAVE_SSE41
+ case kSSE3:
+ case kSlowSSSE3:
+ case kSSE4_1:
+ return 1;
+#endif
+#ifdef WEBP_HAVE_NEON
+ case kNEON:
+ return 1;
+#endif
+ default:
+ break;
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed
+// by the configuration), but it allows NEON to be turned off at runtime for
+// testing purposes, by setting VP8GetCPUInfo = NULL.
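+// On Linux, the runtime check looks for "neon" in the "Features" line of
+// /proc/cpuinfo, e.g. (illustrative):
+//   Features : half thumb fastmult vfp edsp neon vfpv3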
+static int armCPUInfo(CPUFeature feature) {
+ if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+ {
+ int has_neon = 0;
+ char line[200];
+ FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+ if (cpuinfo == NULL) return 0;
+ while (fgets(line, sizeof(line), cpuinfo)) {
+ if (!strncmp(line, "Features", 8)) {
+ if (strstr(line, " neon ") != NULL) {
+ has_neon = 1;
+ break;
+ }
+ }
+ }
+ fclose(cpuinfo);
+ return has_neon;
+ }
+#else
+ return 1;
+#endif
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+ defined(WEBP_USE_MSA)
+static int mipsCPUInfo(CPUFeature feature) {
+ if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
diff --git a/media/libwebp/dsp/dec.c b/media/libwebp/dsp/dec.c
index a599d26bc0..5c94b6d403 100644
--- a/media/libwebp/dsp/dec.c
+++ b/media/libwebp/dsp/dec.c
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8DspInitSSE41();
}
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
diff --git a/media/libwebp/dsp/dec_mips32.c b/media/libwebp/dsp/dec_mips32.c
new file mode 100644
index 0000000000..2d55214faa
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips32.c
@@ -0,0 +1,587 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+static WEBP_INLINE int abs_mips32(int x) {
+ const int sign = x >> 31;
+ return (x ^ sign) - sign;
+}
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ p[-step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ const int a3 = (a1 + 1) >> 1;
+ p[-2 * step] = VP8kclip1[p1 + a3];
+ p[- step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
+ const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
+ const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
+ const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
+ p[-3 * step] = VP8kclip1[p2 + a3];
+ p[-2 * step] = VP8kclip1[p1 + a2];
+ p[- step] = VP8kclip1[p0 + a1];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a2];
+ p[ 2 * step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+ int step, int t, int it) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step];
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
+ return 0;
+ }
+ return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
+ abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
+ abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
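+ // hstride steps across the edge being filtered and vstride walks along it;
+ // size is the number of pixels processed (e.g. VFilter16 below passes
+ // stride, 1, 16 for a pass over a 16-pixel macroblock edge).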
+ const int thresh2 = 2 * thresh + 1;
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter6(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter4(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i, stride, thresh2)) {
+ do_filter2(p + i, stride);
+ }
+ }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i * stride, 1, thresh2)) {
+ do_filter2(p + i * stride, 1);
+ }
+ }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14;
+ int temp15, temp16, temp17, temp18;
+ int16_t* p_in = (int16_t*)in;
+
+ // Loops are unrolled and merged to avoid using a tmp buffer and to reduce
+ // the number of stalls; the MUL operation is written in assembly and
+ // inlined.
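+ //
+ // Scalar sketch of one column pass computed below (illustrative names;
+ // MUL(x, k) = (x * k) >> 16, and kC1 folds in an extra +(1 << 16) so that
+ // MUL(x, kC1) == ((x * 20091) >> 16) + x):
+ //   a = in[0] + in[8];           b = in[0] - in[8];
+ //   c = MUL(in[4], kC2) - MUL(in[12], kC1);
+ //   d = MUL(in[4], kC1) + MUL(in[12], kC2);
+ //   col = { a + d, b + c, b - c, a - d };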
+ __asm__ volatile(
+ "lh %[temp0], 0(%[in]) \n\t"
+ "lh %[temp8], 16(%[in]) \n\t"
+ "lh %[temp4], 8(%[in]) \n\t"
+ "lh %[temp12], 24(%[in]) \n\t"
+ "addu %[temp16], %[temp0], %[temp8] \n\t"
+ "subu %[temp0], %[temp0], %[temp8] \n\t"
+ "mul %[temp8], %[temp4], %[kC2] \n\t"
+ "mul %[temp17], %[temp12], %[kC1] \n\t"
+ "mul %[temp4], %[temp4], %[kC1] \n\t"
+ "mul %[temp12], %[temp12], %[kC2] \n\t"
+ "lh %[temp1], 2(%[in]) \n\t"
+ "lh %[temp5], 10(%[in]) \n\t"
+ "lh %[temp9], 18(%[in]) \n\t"
+ "lh %[temp13], 26(%[in]) \n\t"
+ "sra %[temp8], %[temp8], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp4], %[temp4], 16 \n\t"
+ "sra %[temp12], %[temp12], 16 \n\t"
+ "lh %[temp2], 4(%[in]) \n\t"
+ "lh %[temp6], 12(%[in]) \n\t"
+ "lh %[temp10], 20(%[in]) \n\t"
+ "lh %[temp14], 28(%[in]) \n\t"
+ "subu %[temp17], %[temp8], %[temp17] \n\t"
+ "addu %[temp4], %[temp4], %[temp12] \n\t"
+ "addu %[temp8], %[temp16], %[temp4] \n\t"
+ "subu %[temp4], %[temp16], %[temp4] \n\t"
+ "addu %[temp16], %[temp1], %[temp9] \n\t"
+ "subu %[temp1], %[temp1], %[temp9] \n\t"
+ "lh %[temp3], 6(%[in]) \n\t"
+ "lh %[temp7], 14(%[in]) \n\t"
+ "lh %[temp11], 22(%[in]) \n\t"
+ "lh %[temp15], 30(%[in]) \n\t"
+ "addu %[temp12], %[temp0], %[temp17] \n\t"
+ "subu %[temp0], %[temp0], %[temp17] \n\t"
+ "mul %[temp9], %[temp5], %[kC2] \n\t"
+ "mul %[temp17], %[temp13], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC2] \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "subu %[temp17], %[temp9], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp13], %[temp1], %[temp17] \n\t"
+ "subu %[temp1], %[temp1], %[temp17] \n\t"
+ "mul %[temp17], %[temp14], %[kC1] \n\t"
+ "mul %[temp14], %[temp14], %[kC2] \n\t"
+ "addu %[temp9], %[temp16], %[temp5] \n\t"
+ "subu %[temp5], %[temp16], %[temp5] \n\t"
+ "addu %[temp16], %[temp2], %[temp10] \n\t"
+ "subu %[temp2], %[temp2], %[temp10] \n\t"
+ "mul %[temp10], %[temp6], %[kC2] \n\t"
+ "mul %[temp6], %[temp6], %[kC1] \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp6], %[temp6], %[temp14] \n\t"
+ "addu %[temp10], %[temp16], %[temp6] \n\t"
+ "subu %[temp6], %[temp16], %[temp6] \n\t"
+ "addu %[temp14], %[temp2], %[temp17] \n\t"
+ "subu %[temp2], %[temp2], %[temp17] \n\t"
+ "mul %[temp17], %[temp15], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "addu %[temp16], %[temp3], %[temp11] \n\t"
+ "subu %[temp3], %[temp3], %[temp11] \n\t"
+ "mul %[temp11], %[temp7], %[kC2] \n\t"
+ "mul %[temp7], %[temp7], %[kC1] \n\t"
+ "addiu %[temp8], %[temp8], 4 \n\t"
+ "addiu %[temp12], %[temp12], 4 \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "addiu %[temp4], %[temp4], 4 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp17], %[temp11], %[temp17] \n\t"
+ "addu %[temp7], %[temp7], %[temp15] \n\t"
+ "addu %[temp15], %[temp3], %[temp17] \n\t"
+ "subu %[temp3], %[temp3], %[temp17] \n\t"
+ "addu %[temp11], %[temp16], %[temp7] \n\t"
+ "subu %[temp7], %[temp16], %[temp7] \n\t"
+ "addu %[temp16], %[temp8], %[temp10] \n\t"
+ "subu %[temp8], %[temp8], %[temp10] \n\t"
+ "mul %[temp10], %[temp9], %[kC2] \n\t"
+ "mul %[temp17], %[temp11], %[kC1] \n\t"
+ "mul %[temp9], %[temp9], %[kC1] \n\t"
+ "mul %[temp11], %[temp11], %[kC2] \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp11], %[temp9], %[temp11] \n\t"
+ "addu %[temp10], %[temp12], %[temp14] \n\t"
+ "subu %[temp12], %[temp12], %[temp14] \n\t"
+ "mul %[temp14], %[temp13], %[kC2] \n\t"
+ "mul %[temp9], %[temp15], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "subu %[temp9], %[temp14], %[temp9] \n\t"
+ "addu %[temp15], %[temp13], %[temp15] \n\t"
+ "addu %[temp14], %[temp0], %[temp2] \n\t"
+ "subu %[temp0], %[temp0], %[temp2] \n\t"
+ "mul %[temp2], %[temp1], %[kC2] \n\t"
+ "mul %[temp13], %[temp3], %[kC1] \n\t"
+ "mul %[temp1], %[temp1], %[kC1] \n\t"
+ "mul %[temp3], %[temp3], %[kC2] \n\t"
+ "sra %[temp2], %[temp2], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "subu %[temp13], %[temp2], %[temp13] \n\t"
+ "addu %[temp3], %[temp1], %[temp3] \n\t"
+ "addu %[temp2], %[temp4], %[temp6] \n\t"
+ "subu %[temp4], %[temp4], %[temp6] \n\t"
+ "mul %[temp6], %[temp5], %[kC2] \n\t"
+ "mul %[temp1], %[temp7], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp7], %[temp7], %[kC2] \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp1], %[temp6], %[temp1] \n\t"
+ "addu %[temp7], %[temp5], %[temp7] \n\t"
+ "addu %[temp5], %[temp16], %[temp11] \n\t"
+ "subu %[temp16], %[temp16], %[temp11] \n\t"
+ "addu %[temp11], %[temp8], %[temp17] \n\t"
+ "subu %[temp8], %[temp8], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 3 \n\t"
+ "sra %[temp16], %[temp16], 3 \n\t"
+ "sra %[temp11], %[temp11], 3 \n\t"
+ "sra %[temp8], %[temp8], 3 \n\t"
+ "addu %[temp17], %[temp10], %[temp15] \n\t"
+ "subu %[temp10], %[temp10], %[temp15] \n\t"
+ "addu %[temp15], %[temp12], %[temp9] \n\t"
+ "subu %[temp12], %[temp12], %[temp9] \n\t"
+ "sra %[temp17], %[temp17], 3 \n\t"
+ "sra %[temp10], %[temp10], 3 \n\t"
+ "sra %[temp15], %[temp15], 3 \n\t"
+ "sra %[temp12], %[temp12], 3 \n\t"
+ "addu %[temp9], %[temp14], %[temp3] \n\t"
+ "subu %[temp14], %[temp14], %[temp3] \n\t"
+ "addu %[temp3], %[temp0], %[temp13] \n\t"
+ "subu %[temp0], %[temp0], %[temp13] \n\t"
+ "sra %[temp9], %[temp9], 3 \n\t"
+ "sra %[temp14], %[temp14], 3 \n\t"
+ "sra %[temp3], %[temp3], 3 \n\t"
+ "sra %[temp0], %[temp0], 3 \n\t"
+ "addu %[temp13], %[temp2], %[temp7] \n\t"
+ "subu %[temp2], %[temp2], %[temp7] \n\t"
+ "addu %[temp7], %[temp4], %[temp1] \n\t"
+ "subu %[temp4], %[temp4], %[temp1] \n\t"
+ "sra %[temp13], %[temp13], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "sra %[temp4], %[temp4], 3 \n\t"
+ "addiu %[temp6], $zero, 255 \n\t"
+ "lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "sra %[temp5], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp5], 1f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "1: \n\t"
+ "lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp11] \n\t"
+ "sra %[temp11], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp11], 2f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "2: \n\t"
+ "lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp8] \n\t"
+ "sra %[temp8], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp8], 3f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "3: \n\t"
+ "lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp16] \n\t"
+ "sra %[temp16], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp16], 4f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "4: \n\t"
+ "sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp17] \n\t"
+ "addu %[temp8], %[temp8], %[temp15] \n\t"
+ "addu %[temp11], %[temp11], %[temp12] \n\t"
+ "addu %[temp16], %[temp16], %[temp10] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "beqz %[temp18], 5f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "5: \n\t"
+ "sra %[temp18], %[temp8], 8 \n\t"
+ "sra %[temp1], %[temp8], 31 \n\t"
+ "beqz %[temp18], 6f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp1] \n\t"
+ "6: \n\t"
+ "sra %[temp18], %[temp11], 8 \n\t"
+ "sra %[temp1], %[temp11], 31 \n\t"
+ "sra %[temp17], %[temp16], 8 \n\t"
+ "sra %[temp15], %[temp16], 31 \n\t"
+ "beqz %[temp18], 7f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp1] \n\t"
+ "7: \n\t"
+ "beqz %[temp17], 8f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp15] \n\t"
+ "8: \n\t"
+ "sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp9] \n\t"
+ "addu %[temp8], %[temp8], %[temp3] \n\t"
+ "addu %[temp11], %[temp11], %[temp0] \n\t"
+ "addu %[temp16], %[temp16], %[temp14] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 9f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "9: \n\t"
+ "beqz %[temp17], 10f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "10: \n\t"
+ "beqz %[temp12], 11f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "11: \n\t"
+ "beqz %[temp9], 12f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "12: \n\t"
+ "sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp8], %[temp8], %[temp7] \n\t"
+ "addu %[temp11], %[temp11], %[temp4] \n\t"
+ "addu %[temp16], %[temp16], %[temp2] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 13f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "13: \n\t"
+ "beqz %[temp17], 14f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "14: \n\t"
+ "beqz %[temp12], 15f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "15: \n\t"
+ "beqz %[temp9], 16f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "16: \n\t"
+ "sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18)
+ : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
+ : "memory", "hi", "lo"
+ );
+}
+
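+// Runs one or two 4x4 inverse transforms: the second block's sixteen
+// coefficients follow the first in 'in', and its output lands four pixels
+// to the right in 'dst'.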
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
+ VP8InitClipTables();
+
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/dec_mips_dsp_r2.c b/media/libwebp/dsp/dec_mips_dsp_r2.c
new file mode 100644
index 0000000000..dcc3041019
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips_dsp_r2.c
@@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSP R2 version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
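+// kC2 / 65536 ~= sqrt(2) * sin(pi / 8) ~= 0.5412, and the extra (1 << 16) in
+// kC1 makes MUL(x, kC1) ~= sqrt(2) * cos(pi / 8) * x ~= 1.3066 * x.
+// e.g. MUL(100, kC2) = (100 * 35468) >> 16 = 54 ~= 0.5412 * 100.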
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ "lh %[temp5], 0(%[in]) \n\t"
+ "addiu %[temp5], %[temp5], 4 \n\t"
+ "ins %[temp5], %[temp5], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 3 \n\t"
+ CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+ temp3, temp1, temp2, temp3, temp4)
+ STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+ temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_10()
+ : [in]"r"(in), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
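+// Special case for blocks where only in[0], in[1] and in[4] are non-zero:
+// the DC level plus the first AC coefficient in each direction.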
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ int c4 = MUL(in[4], kC2);
+ const int d4 = MUL(in[4], kC1);
+ const int c1 = MUL(in[1], kC2);
+ const int d1 = MUL(in[1], kC1);
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ins %[c4], %[d4], 16, 16 \n\t"
+ "replv.ph %[temp1], %[a] \n\t"
+ "replv.ph %[temp4], %[d1] \n\t"
+ ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+ "replv.ph %[temp5], %[c1] \n\t"
+ SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+ temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+ LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+ temp11, temp17, temp3, temp5, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+ temp4, temp7, temp6, temp10, temp9)
+ STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+ temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+ temp7, temp6, dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [c4]"+&r"(c4)
+ : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+ : "memory"
+ );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
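+// In-loop filter used on macroblock edges: 'hstride' steps across the edge,
+// 'vstride' advances along it and 'size' positions are filtered. Each
+// position updates either 2 pixels (high edge variance) or 6, hence,
+// presumably, the "26" in the name.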
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "1: \n\t"
+ "negu %[temp1], %[hstride] \n\t"
+ "addiu %[size], %[size], -1 \n\t"
+ "sll %[temp2], %[hstride], 1 \n\t"
+ "sll %[temp3], %[temp1], 1 \n\t"
+ "addu %[temp4], %[temp2], %[hstride] \n\t"
+ "addu %[temp5], %[temp3], %[temp1] \n\t"
+ "lbu %[temp7], 0(%[p]) \n\t"
+ "sll %[temp6], %[temp3], 1 \n\t"
+ "lbux %[temp8], %[temp5](%[p]) \n\t"
+ "lbux %[temp9], %[temp3](%[p]) \n\t"
+ "lbux %[temp10], %[temp1](%[p]) \n\t"
+ "lbux %[temp11], %[temp6](%[p]) \n\t"
+ "lbux %[temp12], %[hstride](%[p]) \n\t"
+ "lbux %[temp13], %[temp2](%[p]) \n\t"
+ "lbux %[temp14], %[temp4](%[p]) \n\t"
+ "subu %[temp1], %[temp10], %[temp7] \n\t"
+ "subu %[temp2], %[temp9], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp1] \n\t"
+ "absq_s.w %[temp4], %[temp2] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "sll %[temp3], %[temp3], 2 \n\t"
+ "addu %[temp15], %[temp3], %[temp4] \n\t"
+ "subu %[temp3], %[temp15], %[thresh2] \n\t"
+ "sll %[temp6], %[temp1], 1 \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp4], %[temp11], %[temp8] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "shll_s.w %[temp2], %[temp2], 24 \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "bgtz %[temp4], 3f \n\t"
+ " subu %[temp3], %[temp8], %[temp9] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp5], %[temp9], %[temp10] \n\t"
+ "absq_s.w %[temp3], %[temp5] \n\t"
+ "absq_s.w %[temp5], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp14], %[temp13] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp13], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "sra %[temp4], %[temp2], 24 \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp15], %[temp12], %[temp7] \n\t"
+ "absq_s.w %[temp3], %[temp15] \n\t"
+ "absq_s.w %[temp15], %[temp15] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
+ "addu %[temp3], %[temp6], %[temp1] \n\t"
+ "or %[temp2], %[temp5], %[temp15] \n\t"
+ "addu %[temp5], %[temp4], %[temp3] \n\t"
+ "beqz %[temp2], 4f \n\t"
+ " shra_r.w %[temp1], %[temp5], 3 \n\t"
+ "addiu %[temp2], %[temp5], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp3], %[p], %[hstride] \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp1], %[temp7], %[temp1] \n\t"
+ "addu %[temp2], %[temp10], %[temp2] \n\t"
+ "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
+ "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp3]) \n\t"
+ "j 3f \n\t"
+ " sb %[temp1], 0(%[p]) \n\t"
+ "4: \n\t"
+ "shll_s.w %[temp5], %[temp5], 24 \n\t"
+ "subu %[temp14], %[p], %[hstride] \n\t"
+ "subu %[temp11], %[temp14], %[hstride] \n\t"
+ "sra %[temp6], %[temp5], 24 \n\t"
+ "sll %[temp1], %[temp6], 3 \n\t"
+ "subu %[temp15], %[temp11], %[hstride] \n\t"
+ "addu %[temp2], %[temp6], %[temp1] \n\t"
+ "sll %[temp3], %[temp2], 1 \n\t"
+ "addu %[temp4], %[temp3], %[temp2] \n\t"
+ "addiu %[temp2], %[temp2], 63 \n\t"
+ "addiu %[temp3], %[temp3], 63 \n\t"
+ "addiu %[temp4], %[temp4], 63 \n\t"
+ "sra %[temp2], %[temp2], 7 \n\t"
+ "sra %[temp3], %[temp3], 7 \n\t"
+ "sra %[temp4], %[temp4], 7 \n\t"
+ "addu %[temp1], %[temp8], %[temp2] \n\t"
+ "addu %[temp5], %[temp9], %[temp3] \n\t"
+ "addu %[temp6], %[temp10], %[temp4] \n\t"
+ "subu %[temp8], %[temp7], %[temp4] \n\t"
+ "subu %[temp7], %[temp12], %[temp3] \n\t"
+ "addu %[temp10], %[p], %[hstride] \n\t"
+ "subu %[temp9], %[temp13], %[temp2] \n\t"
+ "addu %[temp12], %[temp10], %[hstride] \n\t"
+ "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
+ "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
+ "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp15]) \n\t"
+ "sb %[temp3], 0(%[temp11]) \n\t"
+ "sb %[temp4], 0(%[temp14]) \n\t"
+ "sb %[temp5], 0(%[p]) \n\t"
+ "sb %[temp6], 0(%[temp10]) \n\t"
+ "sb %[temp8], 0(%[temp12]) \n\t"
+ "3: \n\t"
+ "bgtz %[size], 1b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ ".set pop \n\t"
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
+ [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
+ [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
+ [size]"+&r"(size), [p]"+&r"(p)
+ : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
+ [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+ [VP8kclip1]"r"(VP8kclip1)
+ : "memory"
+ );
+}
+
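+// Inner-edge variant of the loop above: each position updates either 2 or 4
+// pixels across the edge.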
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ int p0, q0, p1, q1, p2, q2, p3, q3;
+ int step1, step2, temp1, temp2, temp3, temp4;
+ uint8_t* pTemp0;
+ uint8_t* pTemp1;
+ const int thresh2 = 2 * thresh + 1;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "bltz %[size], 3f \n\t"
+ " nop \n\t"
+ "2: \n\t"
+ "negu %[step1], %[hstride] \n\t"
+ "lbu %[q0], 0(%[p]) \n\t"
+ "lbux %[p0], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q1], %[hstride](%[p]) \n\t"
+ "subu %[temp1], %[p0], %[q0] \n\t"
+ "lbux %[p1], %[step1](%[p]) \n\t"
+ "addu %[step2], %[hstride], %[hstride] \n\t"
+ "absq_s.w %[temp2], %[temp1] \n\t"
+ "subu %[temp3], %[p1], %[q1] \n\t"
+ "absq_s.w %[temp4], %[temp3] \n\t"
+ "sll %[temp2], %[temp2], 2 \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "subu %[temp4], %[temp2], %[thresh2] \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " lbux %[p2], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q2], %[step2](%[p]) \n\t"
+ "lbux %[p3], %[step1](%[p]) \n\t"
+ "subu %[temp4], %[p2], %[p1] \n\t"
+ "addu %[step2], %[step2], %[hstride] \n\t"
+ "subu %[temp2], %[p3], %[p2] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "absq_s.w %[temp2], %[temp2] \n\t"
+ "lbux %[q3], %[step2](%[p]) \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "subu %[p3], %[p1], %[p0] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " absq_s.w %[p3], %[p3] \n\t"
+ "subu %[temp4], %[q3], %[q2] \n\t"
+ "subu %[pTemp0], %[p], %[hstride] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "subu %[temp2], %[p3], %[ithresh] \n\t"
+ "sll %[step1], %[temp1], 1 \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "subu %[temp2], %[q2], %[q1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " absq_s.w %[temp2], %[temp2] \n\t"
+ "subu %[q3], %[q1], %[q0] \n\t"
+ "absq_s.w %[q3], %[q3] \n\t"
+ "subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "addu %[temp1], %[temp1], %[step1] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[q3], %[ithresh] \n\t"
+ "slt %[p3], %[hev_thresh], %[p3] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " slt %[q3], %[hev_thresh], %[q3] \n\t"
+ "or %[q3], %[q3], %[p3] \n\t"
+ "bgtz %[q3], 1f \n\t"
+ " shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "addu %[pTemp1], %[p], %[hstride] \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addiu %[step1], %[temp2], 1 \n\t"
+ "sra %[step1], %[step1], 1 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "addu %[p1], %[p1], %[step1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "subu %[q1], %[q1], %[step1] \n\t"
+ "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[pTemp0]) \n\t"
+ "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
+ "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
+ "sb %[temp3], 0(%[p]) \n\t"
+ "sb %[temp4], 0(%[pTemp1]) \n\t"
+ "j 0f \n\t"
+ " sb %[temp1], 0(%[pTemp0]) \n\t"
+ "1: \n\t"
+ "shll_s.w %[temp3], %[temp3], 24 \n\t"
+ "sra %[temp3], %[temp3], 24 \n\t"
+ "addu %[temp1], %[temp1], %[temp3] \n\t"
+ "shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[p]) \n\t"
+ "sb %[temp1], 0(%[pTemp0]) \n\t"
+ "0: \n\t"
+ "subu %[size], %[size], 1 \n\t"
+ "bgtz %[size], 2b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ "3: \n\t"
+ ".set pop \n\t"
+ : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+ [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+ [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+ [size]"+&r"(size)
+ : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+ [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+ [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
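+// The simple filter only adjusts the two pixels adjacent to the edge; a
+// position is filtered when 4 * |p0 - q0| + |p1 - q1| <= thresh2, i.e. the
+// usual 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh test scaled by two.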
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ uint8_t* p1 = p - stride;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ "negu %[temp4], %[stride] \n\t"
+ "sll %[temp5], %[temp4], 1 \n\t"
+ "lbu %[temp2], 0(%[p]) \n\t"
+ "lbux %[temp3], %[stride](%[p]) \n\t"
+ "lbux %[temp1], %[temp4](%[p]) \n\t"
+ "lbux %[temp0], %[temp5](%[p]) \n\t"
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], 0(%[p1]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "addiu %[p1], %[p1], 1 \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addiu %[p], %[p], 1 \n\t"
+    ".set      pop                                     \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A + A1 * BPS]
+// TEMP1 = SRC[B + B1 * BPS]
+// TEMP2 = SRC[C + C1 * BPS]
+// TEMP3 = SRC[D + D1 * BPS]
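+// e.g. LOAD_4_BYTES(t0, t1, t2, t3, -2, 0, -1, 0, 0, 0, 1, 0, p) reads the
+// horizontally adjacent bytes p[-2], p[-1], p[0] and p[1].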
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
+ A, A1, B, B1, C, C1, D, D1, SRC) \
+ "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], -1(%[p]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addu %[p], %[p], %[stride] \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
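+// Simple filter applied to the three inner edges (offsets 4, 8 and 12).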
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+// DST[A * BPS + 0..3]     = TEMP0
+// DST[B + C * BPS + 0..3] = TEMP1
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
+ "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
+ "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
+
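+// Vertical prediction: all four rows repeat the 3-tap smoothed top row,
+// (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2.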
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* top = dst - BPS;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile (
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+ STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
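+// Fill the 4x4 block with the rounded average of the 4 top and 4 left
+// neighbours: (sum + 4) >> 3.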
+static void DC4(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
+ __asm__ volatile (
+ "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
+ LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ins %[temp1], %[temp2], 8, 8 \n\t"
+ "ins %[temp1], %[temp3], 16, 8 \n\t"
+ "ins %[temp1], %[temp4], 24, 8 \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "preceu.ph.qbr %[temp5], %[temp7] \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "preceu.ph.qbl %[temp4], %[temp7] \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "shll.ph %[temp2], %[temp2], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
+ "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shll.ph %[temp0], %[temp0], 1 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
+ "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
+ "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
+ "shra_r.ph %[temp8], %[temp8], 2 \n\t"
+ "ins %[temp7], %[temp5], 0, 8 \n\t"
+ "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "raddu.w.qb %[temp4], %[temp7] \n\t"
+ "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
+ "shra_r.w %[temp4], %[temp4], 2 \n\t"
+ STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+ "prepend %[temp2], %[temp8], 8 \n\t"
+ "prepend %[temp6], %[temp4], 8 \n\t"
+ STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A * BPS + 0..3]
+// TEMP1 = SRC[B + C * BPS + 0..3]
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
+ "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
+
+static void LD4(uint8_t* dst) { // Down-Left
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
+ "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+ "prepend %[temp9], %[temp0], 8 \n\t"
+ "prepend %[temp3], %[temp1], 8 \n\t"
+ STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
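+// Fill the 8x8 chroma block with the rounded average of the 8 top and 8 left
+// neighbours: (sum + 8) >> 4.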
+static void DC8uv(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp9] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "addu %[temp0], %[temp0], %[temp6] \n\t"
+ "shra_r.w %[temp0], %[temp0], 4 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ int temp0, temp1;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp6], %[temp2] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+
+#define CLIPPING(SIZE) \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+".endif \n\t" \
+ "addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
+".endif \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+".endif \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
+".endif \n\t"
+
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
+ int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ ".if " #SIZE " < 8 \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(4) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ ".else \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ ".endif \n\t" \
+ : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, SIZE) do { \
+ int y; \
+ const uint8_t* top = (DST) - BPS; \
+ const int top_1 = ((int)top[-1] << 16) + top[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), top, (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
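+// TrueMotion prediction: dst(x, y) = clip(top(x) + left(y) - top(-1)),
+// evaluated row by row with the saturating shifts and packs in CLIPPING.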
+#define TRUE_MOTION(DST, SIZE) \
+static void TrueMotion##SIZE(uint8_t* (DST)) { \
+ CLIP_TO_DST((DST), (SIZE)); \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+ VP8TransformDC = TransformDC;
+ VP8TransformAC3 = TransformAC3;
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
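+  // Only a subset of the predictors has a DSP R2 version; the remaining
+  // VP8PredLuma4 / VP8PredChroma8 / VP8PredLuma16 entries keep their
+  // portable C implementations.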
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TrueMotion4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TrueMotion8;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+
+ VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/dec_msa.c b/media/libwebp/dsp/dec_msa.c
new file mode 100644
index 0000000000..5b0b14cc93
--- /dev/null
+++ b/media/libwebp/dsp/dec_msa.c
@@ -0,0 +1,1020 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of dsp functions
+//
+// Author(s): Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
+ const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
+ const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
+ \
+ a1_m = in0 + in2; \
+ b1_m = in0 - in2; \
+ c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \
+ c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \
+ d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+}
+#define MULT1(a) ((((a) * 20091) >> 16) + (a))
+#define MULT2(a) (((a) * 35468) >> 16)
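+// In Q16 fixed point: MULT1(a) ~= sqrt(2) * cos(pi / 8) * a and
+// MULT2(a) ~= sqrt(2) * sin(pi / 8) * a, matching kC1/kC2 in the other ports.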
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ const v16i8 zero = { 0 };
+ v16i8 dest0, dest1, dest2, dest3;
+
+ LD_SH2(in, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
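+// Inverse Walsh-Hadamard transform of the 16 DC levels; result k is scattered
+// to out[16 * k], the DC slot of the k-th 4x4 block.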
+static void TransformWHT(const int16_t* in, int16_t* out) {
+ v8i16 input0, input1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1;
+
+ LD_SH2(in, 8, input0, input1);
+ input1 = SLDI_SH(input1, input1, 8);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ out0 = tmp2 + tmp3;
+ out1 = tmp2 - tmp3;
+ VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ tmp0 = tmp2 + tmp3;
+ tmp1 = tmp2 - tmp3;
+ ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);
+ SRAI_H2_SH(out0, out1, 3);
+ out[0] = __msa_copy_s_h(out0, 0);
+ out[16] = __msa_copy_s_h(out0, 4);
+ out[32] = __msa_copy_s_h(out1, 0);
+ out[48] = __msa_copy_s_h(out1, 4);
+ out[64] = __msa_copy_s_h(out0, 1);
+ out[80] = __msa_copy_s_h(out0, 5);
+ out[96] = __msa_copy_s_h(out1, 1);
+ out[112] = __msa_copy_s_h(out1, 5);
+ out[128] = __msa_copy_s_h(out0, 2);
+ out[144] = __msa_copy_s_h(out0, 6);
+ out[160] = __msa_copy_s_h(out1, 2);
+ out[176] = __msa_copy_s_h(out1, 6);
+ out[192] = __msa_copy_s_h(out0, 3);
+ out[208] = __msa_copy_s_h(out0, 7);
+ out[224] = __msa_copy_s_h(out1, 3);
+ out[240] = __msa_copy_s_h(out1, 7);
+}
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ const int DC = (in[0] + 4) >> 3;
+ const v8i16 tmp0 = __msa_fill_h(DC);
+ ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ const int c4 = MULT2(in[4]);
+ const int d4 = MULT1(in[4]);
+ const int in2 = MULT2(in[1]);
+ const int in3 = MULT1(in[1]);
+ v4i32 tmp0 = { 0 };
+ v4i32 out0 = __msa_fill_w(a + d4);
+ v4i32 out1 = __msa_fill_w(a + c4);
+ v4i32 out2 = __msa_fill_w(a - c4);
+ v4i32 out3 = __msa_fill_w(a - d4);
+ v4i32 res0, res1, res2, res3;
+ const v4i32 zero = { 0 };
+ v16u8 dest0, dest1, dest2, dest3;
+
+ INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);
+ ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,
+ out0, out1, out2, out3);
+ SRAI_W4_SW(out0, out1, out2, out3, 3);
+ LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+#define FLIP_SIGN2(in0, in1, out0, out1) { \
+ out0 = (v16i8)__msa_xori_b(in0, 0x80); \
+ out1 = (v16i8)__msa_xori_b(in1, 0x80); \
+}
+
+#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ FLIP_SIGN2(in0, in1, out0, out1); \
+ FLIP_SIGN2(in2, in3, out2, out3); \
+}
+
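+// filt = saturating filt + 3 * (q0 - p0), then masked to the positions that
+// passed the filter threshold.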
+#define FILT_VAL(q0_m, p0_m, mask, filt) do { \
+ v16i8 q0_sub_p0; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = filt & mask; \
+} while (0)
+
+#define FILT2(q_m, p_m, q, p) do { \
+ u_r = SRAI_H(temp1, 7); \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = SRAI_H(temp3, 7); \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q_m = __msa_subs_s_b(q_m, u); \
+ p_m = __msa_adds_s_b(p_m, u); \
+ q = __msa_xori_b((v16u8)q_m, 0x80); \
+ p = __msa_xori_b((v16u8)p_m, 0x80); \
+} while (0)
+
+#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m; \
+ v16i8 filt, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & hev; \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+} while (0)
+
+#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do { \
+ v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ v16i8 u, filt, t1, t2, filt_sign; \
+ v8i16 filt_r, filt_l, u_r, u_l; \
+ v8i16 temp0, temp1, temp2, temp3; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ const v8i16 cnst9h = __msa_ldi_h(9); \
+ const v8i16 cnst63h = __msa_ldi_h(63); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ FLIP_SIGN2(p2, q2, p2_m, q2_m); \
+ t2 = filt & hev; \
+ /* filt_val &= ~hev */ \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ t1 = __msa_adds_s_b(t2, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(t2, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
+ /* update q2/p2 */ \
+ temp0 = filt_r * cnst9h; \
+ temp1 = temp0 + cnst63h; \
+ temp2 = filt_l * cnst9h; \
+ temp3 = temp2 + cnst63h; \
+ FILT2(q2_m, p2_m, q2, p2); \
+ /* update q1/p1 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q1_m, p1_m, q1, p1); \
+ /* update q0/p0 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q0_m, p0_m, q0, p0); \
+} while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
+ q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, \
+ hev_out, mask_out) do { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ v16u8 flat_out; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = (thresh_in < flat_out); \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1); \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = (b_limit_in < p0_asub_q0_m); \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ mask_out = (limit_in < mask_out); \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+} while (0)
+
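+// Stores 6 bytes: a word from in0 followed by a halfword from in1 at
+// pdst + stride (callers pass stride == 4, making the six bytes contiguous).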
+#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { \
+ const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx); \
+ const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx); \
+ SW(tmp0_w, pdst); \
+ SH(tmp0_h, pdst + stride); \
+} while (0)
+
+#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \
+ uint8_t* ptmp1 = (uint8_t*)pdst; \
+ ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \
+} while (0)
+
+#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 = SRAI_B(filt1, 3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 = SRAI_B(filt2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
+} while (0)
+
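+// mask selects positions where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit,
+// the simple-filter threshold test.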
+#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do { \
+ v16u8 p1_a_sub_q1, p0_a_sub_q0; \
+ \
+ p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
+ p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
+ p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
+ p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
+ mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
+ mask = (mask <= b_limit); \
+} while (0)
+
+static void VFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptemp = src - 4 * stride;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ptemp = src - 3 * stride;
+ ST_UB4(p2, p1, p0, q0, ptemp, stride);
+ ptemp += (4 * stride);
+ ST_UB2(q1, q2, ptemp, stride);
+}
+
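+// Vertical-edge counterpart: transposes 16 rows into pixel vectors, filters,
+// and writes the 6 modified bytes of each row back.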
+static void HFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp = src - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ ptmp += (8 * stride);
+ LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp = src - 3;
+ ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
+}
+
+// on the three inner edges (one helper call per edge)
+static void VFilterHorEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
+}
+
+static void VFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ VFilterHorEdge16i(src_y + 4 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 8 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
+}
+
+static void HFilterVertEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * stride), stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+ src -= 2;
+ ST4x8_UB(tmp2, tmp3, src, stride);
+ src += (8 * stride);
+ ST4x8_UB(tmp4, tmp5, src, stride);
+}
+
+static void HFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ HFilterVertEdge16i(src_y + 4, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 8, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
+}
+
+// 8-pixels wide variants, for chroma filtering
+static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4 * stride;
+ uint8_t* ptmp_src_v = src_v - 4 * stride;
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ p2_d = __msa_copy_s_d((v2i64)p2, 0);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ q2_d = __msa_copy_s_d((v2i64)q2, 0);
+ ptmp_src_u += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);
+ ptmp_src_u += (4 * stride);
+ SD(q1_d, ptmp_src_u);
+ ptmp_src_u += stride;
+ SD(q2_d, ptmp_src_u);
+ p2_d = __msa_copy_s_d((v2i64)p2, 1);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ q2_d = __msa_copy_s_d((v2i64)q2, 1);
+ ptmp_src_v += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);
+ ptmp_src_v += (4 * stride);
+ SD(q1_d, ptmp_src_v);
+ ptmp_src_v += stride;
+ SD(q2_d, ptmp_src_v);
+}
+
+static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4;
+ uint8_t* ptmp_src_v = src_v - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptmp_src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp_src_u += 1;
+ ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);
+ ptmp_src_u += 4 * stride;
+ ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);
+ ptmp_src_v += 1;
+ ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);
+ ptmp_src_v += 4 * stride;
+ ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
+}
+
+static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ src_u += (5 * stride);
+ LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ src_v += (5 * stride);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
+}
+
+static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+ src_u += 2;
+ ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);
+ src_u += 4 * stride;
+ ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);
+ src_v += 2;
+ ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);
+ src_v += 4 * stride;
+ ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
+}
+
+static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ST_UB2(p0, q0, src - stride, stride);
+}
+
+static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ uint8_t* ptemp_src = src - 2;
+
+ LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptemp_src + 8 * stride, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+ ptemp_src += 1;
+ ST2x4_UB(tmp1, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp1, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+}
+
+static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleVFilter16(src_y + 4 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 8 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);
+}
+
+static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleHFilter16(src_y + 4, stride, b_limit_in);
+ SimpleHFilter16(src_y + 8, stride, b_limit_in);
+ SimpleHFilter16(src_y + 12, stride, b_limit_in);
+}
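+// The Simple*Filter functions above implement VP8's "simple" in-loop filter,
+// which only adjusts the two pixels straddling the edge. A scalar sketch of
+// the per-column math (paraphrasing RFC 6386; not part of upstream -- the
+// MSA macros above vectorize this across 16 columns at once):
+static WEBP_INLINE int SketchClamp128(int v) {
+  return (v < -128) ? -128 : (v > 127) ? 127 : v;
+}
+static WEBP_INLINE void SketchSimpleFilter(uint8_t* p1, uint8_t* p0,
+                                           uint8_t* q0, uint8_t* q1) {
+  // work on sign-centered values: u - 128
+  const int P1 = *p1 - 128, P0 = *p0 - 128;
+  const int Q0 = *q0 - 128, Q1 = *q1 - 128;
+  const int a = SketchClamp128(SketchClamp128(P1 - Q1) + 3 * (Q0 - P0));
+  const int F1 = SketchClamp128(a + 4) >> 3;  // correction applied to q0
+  const int F2 = SketchClamp128(a + 3) >> 3;  // correction applied to p0
+  *q0 = (uint8_t)(SketchClamp128(Q0 - F1) + 128);
+  *p0 = (uint8_t)(SketchClamp128(P0 + F2) + 128);
+}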
+
+//------------------------------------------------------------------------------
+// Intra predictions
+//------------------------------------------------------------------------------
+
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static void TM4(uint8_t* dst) {
+ const uint8_t* const ptemp = dst - BPS - 1;
+ v8i16 T, d, r0, r1, r2, r3;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);
+ const v16u8 T1 = LD_UB(ptemp + 1);
+
+ T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ d = T - TL;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* const ptop = dst - BPS - 1;
+ const uint32_t val0 = LW(ptop + 0);
+ const uint32_t val1 = LW(ptop + 4);
+ uint32_t out;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
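+// VE4 needs AVG3(a, b, c) = (a + 2*b + c + 2) >> 2 in every lane, but MSA
+// has no 3-tap average. It is assembled from two 2-tap ones: __msa_ave_u_b
+// truncates ((x + y) >> 1) while __msa_aver_u_b rounds ((x + y + 1) >> 1),
+// and the identity
+//   aver(ave(a, c), b) == ((((a + c) >> 1) + b + 1) >> 1)
+//                      == (a + 2*b + c + 2) >> 2
+// holds exactly for all byte values (easily brute-force checked over
+// 0..255 for a, b, c), so the three intrinsics compute AVG3 per lane.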
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8_t* const ptop = dst - 1 - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A, B, C, AC, B2, R, A1 = { 0 };
+
+ INSERT_W2_UB(val0, val1, A1);
+ A = SLDI_UB(A1, A1, 12);
+ A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static void LD4(uint8_t* dst) { // Down-Left
+ const uint8_t* const ptop = dst - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+// 16x16
+
+static void DC16(uint8_t* dst) { // DC
+ uint32_t dc = 16;
+ int i;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ for (i = 0; i < 16; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 5);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
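+// DC16 reduces to dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5,
+// broadcast over the block: the top row is summed with vector hadd
+// reductions while the left column, whose samples are BPS apart, is summed
+// scalar. Plain-C reference of the same value (sketch, not part of
+// upstream):
+static WEBP_INLINE uint8_t SketchDC16Value(const uint8_t* dst) {
+  uint32_t dc = 16;  // rounding bias
+  int i;
+  for (i = 0; i < 16; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  return (uint8_t)(dc >> 5);
+}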
+
+static void TM16(uint8_t* dst) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v16i8 T = LD_SB(dst - BPS);
+
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
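+// TM16 is TrueMotion prediction: dst[y][x] = clip255(left[y] + top[x] - tl),
+// where tl is the top-left corner sample. The vector code computes
+// d = top - tl once, then adds the broadcast left[y] per row and clips.
+// A plain-C reference of the same result (sketch, not part of upstream):
+static WEBP_INLINE void SketchTM16(uint8_t* dst) {
+  int x, y;
+  for (y = 0; y < 16; ++y) {
+    for (x = 0; x < 16; ++x) {
+      const int v = dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS];
+      dst[x + y * BPS] = (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
+    }
+  }
+}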
+
+static void VE16(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 16; j > 0; j -= 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+ int j;
+ uint32_t dc = 8;
+ v16u8 out;
+
+ for (j = 0; j < 16; ++j) {
+ dc += dst[-1 + j * BPS];
+ }
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+ uint32_t dc = 8;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing
+ const v16u8 out = (v16u8)__msa_fill_b(0x80);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+// Chroma
+
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static void DC8uv(uint8_t* dst) { // DC
+ uint32_t dc = 8;
+ int i;
+ uint64_t out;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += __msa_copy_s_w((v4i32)temp2, 0);
+ dctemp = (v16u8)__msa_fill_b(dc >> 4);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void TM8uv(uint8_t* dst) {
+ int j;
+ const v16i8 T1 = LD_SB(dst - BPS);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, T1);
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v8i16 d = T - TL;
+
+ for (j = 0; j < 8; j += 4) {
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);
+ ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);
+ STORE8x8(out, dst);
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ const uint32_t dc = 4;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);
+ const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);
+ const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ uint32_t dc = 4;
+ int i;
+ uint64_t out;
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dctemp = (v16u8)__msa_fill_b(dc >> 3);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ const uint64_t out = 0x8080808080808080ULL;
+ STORE8x8(out, dst);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
+ VP8TransformWHT = TransformWHT;
+ VP8Transform = TransformTwo;
+ VP8TransformDC = TransformDC;
+ VP8TransformAC3 = TransformAC3;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8DspInitMSA)
+
+#endif // WEBP_USE_MSA
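+// When MSA support is not compiled in, WEBP_DSP_INIT_STUB (from dsp.h) still
+// emits the entry point so callers can invoke it unconditionally; it expands
+// to roughly (paraphrased, see dsp.h for the exact definition):
+//   extern void VP8DspInitMSA(void);
+//   void VP8DspInitMSA(void) {}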
diff --git a/media/libwebp/dsp/dec_neon.c b/media/libwebp/dsp/dec_neon.c
index e8341327e4..ea690646d6 100644
--- a/media/libwebp/dsp/dec_neon.c
+++ b/media/libwebp/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC
const uint8x8_t A = vld1_u8(dst - BPS); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
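// The rewrite above folds each widen-then-add pair into a single widening
// add: vaddl_u8(a, b) is exactly vaddq_u16(vmovl_u8(a), vmovl_u8(b)), but
// compiles to one UADDL/VADDL.U8 instruction instead of three.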
@@ -1361,7 +1361,8 @@ static void RD4_NEON(uint8_t* dst) { // Down-right
const uint32_t J = dst[-1 + 1 * BPS];
const uint32_t K = dst[-1 + 2 * BPS];
const uint32_t L = dst[-1 + 3 * BPS];
- const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJI____ =
+ vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
@@ -1427,25 +1428,30 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p2 = vaddlv_u8(A);
+ sum_top = vdupq_n_u16(p2);
+#else
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1);
sum_top = vcombine_u16(p2, p2);
+#endif
}
if (do_left) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
sum_left = vaddq_u16(s01, s23);
@@ -1505,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p3 = vaddlvq_u8(A);
+ sum_top = vdupq_n_u16(p3);
+#else
const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
sum_top = vcombine_u16(p3, p3);
+#endif
}
if (do_left) {
int i;
sum_left = vdupq_n_u16(0);
for (i = 0; i < 16; i += 8) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
const uint16x8_t sum = vaddq_u16(s01, s23);
diff --git a/media/libwebp/dsp/dec_sse2.c b/media/libwebp/dsp/dec_sse2.c
index f187a5bb48..b90c082793 100644
--- a/media/libwebp/dsp/dec_sse2.c
+++ b/media/libwebp/dsp/dec_sse2.c
@@ -326,7 +326,7 @@ static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
*pi = _mm_adds_epi8(*pi, delta);
*qi = _mm_subs_epi8(*qi, delta);
FLIP_SIGN_BIT2(*pi, *qi);
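// _mm_set1_epi8() takes a char argument, and 0x80 == 128 does not fit in a
// signed char, so the (char) casts added above make the narrowing explicit
// without changing the bit pattern. The 0x80 constant itself is the usual
// unsigned<->signed pivot: XOR-ing a byte with 0x80 maps [0, 255] onto
// [-128, 127] while preserving order (e.g. 200 = 0xC8 becomes 0x48 = 72 =
// 200 - 128), letting unsigned pixels use the saturating signed byte ops
// _mm_adds_epi8 / _mm_subs_epi8 seen in Update2Pixels_SSE2 above.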
@@ -338,9 +338,9 @@ static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
- const __m128i m_thresh = _mm_set1_epi8(thresh);
+ const __m128i m_thresh = _mm_set1_epi8((char)thresh);
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
- const __m128i kFE = _mm_set1_epi8(0xFE);
+ const __m128i kFE = _mm_set1_epi8((char)0xFE);
const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero
const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2
@@ -360,7 +360,7 @@ static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
__m128i a, mask;
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
// convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
@@ -380,7 +380,7 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
const __m128i k64 = _mm_set1_epi8(64);
const __m128i k3 = _mm_set1_epi8(3);
const __m128i k4 = _mm_set1_epi8(4);
@@ -427,7 +427,7 @@ static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
__m128i a, not_hev;
// compute hev mask
@@ -941,7 +941,7 @@ static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
- const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
diff --git a/media/libwebp/dsp/dsp.h b/media/libwebp/dsp/dsp.h
index 4e509bd2ca..ce1679ea53 100644
--- a/media/libwebp/dsp/dsp.h
+++ b/media/libwebp/dsp/dsp.h
@@ -27,6 +27,23 @@ extern "C" {
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+#if defined(__GNUC__)
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
+#else
+#define WEBP_RESTRICT
+#endif
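+// For instance (hypothetical helper, not part of this header), qualifying
+// both pointers tells the compiler that dst[] and src[] never overlap, so a
+// loop like this can be vectorized without runtime aliasing checks:
+//   static void AddRows(const uint8_t* WEBP_RESTRICT src,
+//                       uint8_t* WEBP_RESTRICT dst, int len) {
+//     int i;
+//     for (i = 0; i < len; ++i) dst[i] = (uint8_t)(dst[i] + src[i]);
+//   }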
+
+//------------------------------------------------------------------------------
// CPU detection
#if defined(__GNUC__)
@@ -51,9 +68,7 @@ extern "C" {
# define __has_builtin(x) 0
#endif
-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
-
+#if !defined(HAVE_CONFIG_H)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@@ -63,23 +78,37 @@ extern "C" {
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
#endif
+#endif
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
#define WEBP_USE_SSE2
#endif
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
#define WEBP_USE_SSE41
#endif
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
- defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
@@ -90,11 +119,20 @@ extern "C" {
#define WEBP_USE_NEON
#endif
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h.
+#if defined(_MSC_VER) && \
+ ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+ (_MSC_VER >= 1920 && defined(_M_ARM64)))
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
#endif
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
@@ -110,13 +148,11 @@ extern "C" {
#define WEBP_USE_MSA
#endif
-#endif /* EMSCRIPTEN */
-
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
@@ -193,6 +229,12 @@ extern "C" {
#endif
#endif
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
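+// Example: computing a row pointer from a plane that may legitimately be
+// NULL. Plain 'alpha + y * stride' is undefined behavior when alpha == NULL
+// (even if the result is never dereferenced) and UBSan reports it; the macro
+// avoids that:
+//   const uint8_t* const row = WEBP_OFFSET_PTR(alpha, y * stride);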
+
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
@@ -246,9 +288,9 @@ extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
@@ -572,26 +614,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
// Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride);
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride);
// Extract the green values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
@@ -604,34 +649,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse);
// Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
// Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
- const uint8_t* g, const uint8_t* b, int len,
- uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+// replaces transparent values in src[] by 'color'.
+extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
diff --git a/media/libwebp/dsp/enc.c b/media/libwebp/dsp/enc.c
new file mode 100644
index 0000000000..69a2f5c577
--- /dev/null
+++ b/media/libwebp/dsp/enc.c
@@ -0,0 +1,830 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+ return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int clip_max(int v, int max) {
+ return (v > max) ? max : v;
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+const int VP8DspScan[16 + 4 + 4] = {
+ // Luma
+ 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
+ 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
+ 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
+ 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+ 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
+ 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
+};
+
+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo) {
+ int max_value = 0, last_non_zero = 1;
+ int k;
+ for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+ const int value = distribution[k];
+ if (value > 0) {
+ if (value > max_value) max_value = value;
+ last_non_zero = k;
+ }
+ }
+ histo->max_value = max_value;
+ histo->last_non_zero = last_non_zero;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int k;
+ int16_t out[16];
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ const int v = abs(out[k]) >> 3;
+ const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+ ++distribution[clipped_value];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
+ if (!tables_ok) {
+ int i;
+ for (i = -255; i <= 255 + 255; ++i) {
+ clip1[255 + i] = clip_8b(i);
+ }
+ tables_ok = 1;
+ }
+}
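+// clip1[] is laid out so that clip1[255 + v] == clip_8b(v) for any
+// v in [-255, 510]. The TrueMotion predictors below exploit this with offset
+// pointers: clip = clip1 + 255 - topleft and clip_table = clip + left[y]
+// make clip_table[top[x]] resolve to clip1[255 + top[x] + left[y] - topleft],
+// i.e. the clipped TM prediction without any per-pixel branching.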
+
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#if !WEBP_NEON_OMIT_C_CODE
+
+#define STORE(x, y, v) \
+ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int C[4 * 4], *tmp;
+ int i;
+ tmp = C;
+ for (i = 0; i < 4; ++i) { // vertical pass
+ const int a = in[0] + in[8];
+ const int b = in[0] - in[8];
+ const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
+ const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+ tmp[0] = a + d;
+ tmp[1] = b + c;
+ tmp[2] = b - c;
+ tmp[3] = a - d;
+ tmp += 4;
+ in++;
+ }
+
+ tmp = C;
+ for (i = 0; i < 4; ++i) { // horizontal pass
+ const int dc = tmp[0] + 4;
+ const int a = dc + tmp[8];
+ const int b = dc - tmp[8];
+ const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+ const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+ STORE(0, i, a + d);
+ STORE(1, i, b + c);
+ STORE(2, i, b - c);
+ STORE(3, i, a - d);
+ tmp++;
+ }
+}
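+// kC1 and kC2 are Q16 fixed-point constants: (20091 + 65536) / 65536 ~=
+// 1.30656 ~= sqrt(2) * cos(pi / 8) and 35468 / 65536 ~= 0.54120 ~=
+// sqrt(2) * sin(pi / 8), with MUL(a, b) = (a * b) >> 16 as the fixed-point
+// multiply. For example, MUL(100, kC2) = (100 * 35468) >> 16 = 54, i.e.
+// roughly 100 * 0.5412.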
+
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ int i;
+ int tmp[16];
+ for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
+ const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
+ const int d1 = src[1] - ref[1];
+ const int d2 = src[2] - ref[2];
+ const int d3 = src[3] - ref[3];
+ const int a0 = (d0 + d3); // 10b [-510,510]
+ const int a1 = (d1 + d2);
+ const int a2 = (d1 - d2);
+ const int a3 = (d0 - d3);
+ tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
+ tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
+ tmp[2 + i * 4] = (a0 - a1) * 8;
+ tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
+ }
+ for (i = 0; i < 4; ++i) {
+ const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
+ const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
+ const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
+ const int a3 = (tmp[0 + i] - tmp[12 + i]);
+ out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
+ out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+ out[8 + i] = (a0 - a1 + 7) >> 4;
+ out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
+ }
+}
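+// The forward transform uses the matching constants in Q12 precision:
+// 5352/4096 ~= 1.3066 ~= sqrt(2)*cos(pi/8) and 2217/4096 ~= 0.5412 ~=
+// sqrt(2)*sin(pi/8). The first pass shifts right by 9 rather than 12,
+// folding in the same x8 up-scaling that tmp[0] and tmp[2] get explicitly;
+// the remaining additive constants (1812, 937, 12000, 51000) act as
+// rounding/bias terms for those shifts.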
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ VP8FTransform(src, ref, out);
+ VP8FTransform(src + 4, ref + 4, out + 16);
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+ // input is 12b signed
+ int32_t tmp[16];
+ int i;
+ for (i = 0; i < 4; ++i, in += 64) {
+ const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
+ const int a1 = (in[1 * 16] + in[3 * 16]);
+ const int a2 = (in[1 * 16] - in[3 * 16]);
+ const int a3 = (in[0 * 16] - in[2 * 16]);
+ tmp[0 + i * 4] = a0 + a1; // 14b
+ tmp[1 + i * 4] = a3 + a2;
+ tmp[2 + i * 4] = a3 - a2;
+ tmp[3 + i * 4] = a0 - a1;
+ }
+ for (i = 0; i < 4; ++i) {
+ const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
+ const int a1 = (tmp[4 + i] + tmp[12+ i]);
+ const int a2 = (tmp[4 + i] - tmp[12+ i]);
+ const int a3 = (tmp[0 + i] - tmp[8 + i]);
+ const int b0 = a0 + a1; // 16b
+ const int b1 = a3 + a2;
+ const int b2 = a3 - a2;
+ const int b3 = a0 - a1;
+ out[ 0 + i] = b0 >> 1; // 15b
+ out[ 4 + i] = b1 >> 1;
+ out[ 8 + i] = b2 >> 1;
+ out[12 + i] = b3 >> 1;
+ }
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+#undef MUL
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+ int j;
+ for (j = 0; j < size; ++j) {
+ memset(dst + j * BPS, value, size);
+ }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+ const uint8_t* top, int size) {
+ int j;
+ if (top != NULL) {
+ for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
+ } else {
+ Fill(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < size; ++j) {
+ memset(dst + j * BPS, left[j], size);
+ }
+ } else {
+ Fill(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ int y;
+ if (left != NULL) {
+ if (top != NULL) {
+ const uint8_t* const clip = clip1 + 255 - left[-1];
+ for (y = 0; y < size; ++y) {
+ const uint8_t* const clip_table = clip + left[y];
+ int x;
+ for (x = 0; x < size; ++x) {
+ dst[x] = clip_table[top[x]];
+ }
+ dst += BPS;
+ }
+ } else {
+ HorizontalPred(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred(dst, top, size);
+ } else {
+ Fill(dst, 129, size);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top,
+ int size, int round, int shift) {
+ int DC = 0;
+ int j;
+ if (top != NULL) {
+ for (j = 0; j < size; ++j) DC += top[j];
+ if (left != NULL) { // top and left present
+ for (j = 0; j < size; ++j) DC += left[j];
+ } else { // top, but no left
+ DC += DC;
+ }
+ DC = (DC + round) >> shift;
+ } else if (left != NULL) { // left but no top
+ for (j = 0; j < size; ++j) DC += left[j];
+ DC += DC;
+ DC = (DC + round) >> shift;
+ } else { // no top, no left, nothing.
+ DC = 0x80;
+ }
+ Fill(dst, DC, size);
+}
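+// The round/shift pair encodes the divisor: 16x16 luma calls
+// DCMode(dst, left, top, 16, 16, 5), i.e. (sum of 32 samples + 16) >> 5,
+// and chroma uses (size, round, shift) = (8, 8, 4). When only one side is
+// available, 'DC += DC' doubles the partial sum so the same shift still
+// divides by the number of samples actually read. Worked example: all 32
+// neighbors equal to 100 gives DC = (3200 + 16) >> 5 = 100.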
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_C(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode(I16DC16 + dst, left, top, 16, 16, 5);
+ VerticalPred(I16VE16 + dst, top, 16);
+ HorizontalPred(I16HE16 + dst, left, 16);
+ TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const uint8_t vals[4] = {
+ AVG3(top[-1], top[0], top[1]),
+ AVG3(top[ 0], top[1], top[2]),
+ AVG3(top[ 1], top[2], top[3]),
+ AVG3(top[ 2], top[3], top[4])
+ };
+ int i;
+ for (i = 0; i < 4; ++i) {
+ memcpy(dst + i * BPS, vals, 4);
+ }
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill(dst, dc >> 3, 4);
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 3) = AVG3(J, K, L);
+ DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
+ DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
+ DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+ DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
+ DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 2) = AVG3(E, F, G);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+ int x, y;
+ const uint8_t* const clip = clip1 + 255 - top[-1];
+ for (y = 0; y < 4; ++y) {
+ const uint8_t* const clip_table = clip + top[-2 - y];
+ for (x = 0; x < 4; ++x) {
+ dst[x] = clip_table[top[x]];
+ }
+ dst += BPS;
+ }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+// Left samples are at top[-5 .. -2], the top-left sample is top[-1], the
+// top samples are at top[0..3], and the top-right samples at top[4..7].
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+ int w, int h) {
+ int count = 0;
+ int y, x;
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ const int diff = (int)a[x] - b[x];
+ count += diff * diff;
+ }
+ a += BPS;
+ b += BPS;
+ }
+ return count;
+}
+
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 16, 16);
+}
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 16, 8);
+}
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 8, 8);
+}
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 4, 4);
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+ int k, x, y;
+ for (k = 0; k < 4; ++k) {
+ uint32_t avg = 0;
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ avg += ref[x + y * BPS];
+ }
+ }
+ dc[k] = avg;
+ ref += 4; // go to next 4x4 block.
+ }
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+#if !WEBP_NEON_OMIT_C_CODE
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+ int sum = 0;
+ int tmp[16];
+ int i;
+ // horizontal pass
+ for (i = 0; i < 4; ++i, in += BPS) {
+ const int a0 = in[0] + in[2];
+ const int a1 = in[1] + in[3];
+ const int a2 = in[1] - in[3];
+ const int a3 = in[0] - in[2];
+ tmp[0 + i * 4] = a0 + a1;
+ tmp[1 + i * 4] = a3 + a2;
+ tmp[2 + i * 4] = a3 - a2;
+ tmp[3 + i * 4] = a0 - a1;
+ }
+ // vertical pass
+ for (i = 0; i < 4; ++i, ++w) {
+ const int a0 = tmp[0 + i] + tmp[8 + i];
+ const int a1 = tmp[4 + i] + tmp[12+ i];
+ const int a2 = tmp[4 + i] - tmp[12+ i];
+ const int a3 = tmp[0 + i] - tmp[8 + i];
+ const int b0 = a0 + a1;
+ const int b1 = a3 + a2;
+ const int b2 = a3 - a2;
+ const int b3 = a0 - a1;
+
+ sum += w[ 0] * abs(b0);
+ sum += w[ 4] * abs(b1);
+ sum += w[ 8] * abs(b2);
+ sum += w[12] * abs(b3);
+ }
+ return sum;
+}
+
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform(a, w);
+ const int sum2 = TTransform(b, w);
+ return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_C(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
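+// TTransform is an unnormalized 4x4 Walsh-Hadamard transform (pure butterfly
+// adds/subtracts), so Disto4x4 compares the weighted spectral magnitudes of
+// the two blocks rather than raw pixel differences; the final '>> 5' roughly
+// compensates for the gain the two unnormalized passes introduce.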
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
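+// kZigzag maps scan position n to the raster index j inside the 4x4 block,
+// walking anti-diagonals so low-frequency coefficients are scanned first:
+//   raster layout:   0  1  2  3     scan order visits 0, 1, 4, 8, 5, 2, ...
+//                    4  5  6  7
+//                    8  9 10 11
+//                   12 13 14 15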
+
+// Simple quantization
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int last = -1;
+ int n;
+ for (n = 0; n < 16; ++n) {
+ const int j = kZigzag[n];
+ const int sign = (in[j] < 0);
+ const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ if (coeff > mtx->zthresh_[j]) {
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * (int)Q;
+ out[n] = level;
+ if (level) last = n;
+ } else {
+ out[n] = 0;
+ in[j] = 0;
+ }
+ }
+ return (last >= 0);
+}
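+// QUANTDIV (defined in vp8i_enc.h) turns the per-coefficient division by the
+// quantizer Q into a fixed-point multiply by a precomputed reciprocal iQ with
+// bias B, roughly level = (coeff * iQ + B) >> QFIX. Worked sketch with a
+// hypothetical Q = 8 (so iQ ~= (1 << QFIX) / 8): coeff = 100 quantizes to
+// level ~= 100 / 8 = 12 (the exact value depends on the bias), and the
+// coefficient is rebuilt in place as in[j] = level * Q = 96.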
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Block copy
+
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+ int y;
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, w);
+ src += BPS;
+ dst += BPS;
+ }
+}
+
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 16, 8);
+}
+
+//------------------------------------------------------------------------------
+// Initialization
+
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
+
+extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
+extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);
+extern void VP8EncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
+ VP8DspInit(); // common inverse transforms
+ InitTables();
+
+ // default C implementations
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8ITransform = ITransform_C;
+ VP8FTransform = FTransform_C;
+ VP8FTransformWHT = FTransformWHT_C;
+ VP8TDisto4x4 = Disto4x4_C;
+ VP8TDisto16x16 = Disto16x16_C;
+ VP8CollectHistogram = CollectHistogram_C;
+ VP8SSE16x16 = SSE16x16_C;
+ VP8SSE16x8 = SSE16x8_C;
+ VP8SSE8x8 = SSE8x8_C;
+ VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+ VP8EncQuantizeBlock = QuantizeBlock_C;
+ VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+ VP8FTransform2 = FTransform2_C;
+ VP8EncPredLuma4 = Intra4Preds_C;
+ VP8EncPredLuma16 = Intra16Preds_C;
+ VP8EncPredChroma8 = IntraChromaPreds_C;
+ VP8Mean16x4 = Mean16x4_C;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+ VP8Copy4x4 = Copy4x4_C;
+ VP8Copy16x8 = Copy16x8_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8EncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8EncDspInitMSA();
+ }
+#endif
+ }
+
+#if defined(WEBP_HAVE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8EncDspInitNEON();
+ }
+#endif
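+
+  // NEON is handled outside the VP8GetCPUInfo block above: when
+  // WEBP_NEON_OMIT_C_CODE is set there are no C fallbacks, so the NEON entry
+  // points must be installed unconditionally.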
+
+ assert(VP8ITransform != NULL);
+ assert(VP8FTransform != NULL);
+ assert(VP8FTransformWHT != NULL);
+ assert(VP8TDisto4x4 != NULL);
+ assert(VP8TDisto16x16 != NULL);
+ assert(VP8CollectHistogram != NULL);
+ assert(VP8SSE16x16 != NULL);
+ assert(VP8SSE16x8 != NULL);
+ assert(VP8SSE8x8 != NULL);
+ assert(VP8SSE4x4 != NULL);
+ assert(VP8EncQuantizeBlock != NULL);
+ assert(VP8EncQuantize2Blocks != NULL);
+ assert(VP8FTransform2 != NULL);
+ assert(VP8EncPredLuma4 != NULL);
+ assert(VP8EncPredLuma16 != NULL);
+ assert(VP8EncPredChroma8 != NULL);
+ assert(VP8Mean16x4 != NULL);
+ assert(VP8EncQuantizeBlockWHT != NULL);
+ assert(VP8Copy4x4 != NULL);
+ assert(VP8Copy16x8 != NULL);
+}
diff --git a/media/libwebp/dsp/enc_mips32.c b/media/libwebp/dsp/enc_mips32.c
new file mode 100644
index 0000000000..ee26dfb493
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips32.c
@@ -0,0 +1,677 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+// Slobodan Prijic (slobodan.prijic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// macro for one vertical pass in ITransformOne
+// MUL macro inlined
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from in buffer
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+// TEMP4..TEMP5 - temporary registers
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lh %[temp16], " #A "(%[temp20]) \n\t" \
+ "lh %[temp18], " #B "(%[temp20]) \n\t" \
+ "lh %[temp17], " #C "(%[temp20]) \n\t" \
+ "lh %[temp19], " #D "(%[temp20]) \n\t" \
+ "addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
+ "subu %[temp16], %[temp16], %[temp18] \n\t" \
+ "mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
+ "mul %[temp18], %[temp19], %[kC1] \n\t" \
+ "mul %[temp17], %[temp17], %[kC1] \n\t" \
+ "mul %[temp19], %[temp19], %[kC2] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
+ "sra %[temp18], %[temp18], 16 \n\n" \
+ "sra %[temp17], %[temp17], 16 \n\n" \
+ "sra %[temp19], %[temp19], 16 \n\n" \
+ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
+ "addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
+ "addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
+ "addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
+
+// macro for one horizontal pass in ITransformOne
+// MUL and STORE macros inlined
+// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from ref and store to dst buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
+ "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
+ "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
+ "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "lw %[temp20], 0(%[args]) \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
+ "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
+ "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
+ "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
+ "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
+ "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
+ "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
+ "addiu %[temp20], $zero, 255 \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
+ "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
+ "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
+ "lw %[temp16], 8(%[args]) \n\t" \
+ "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
+ "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
+ "sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
+
+// Does a single inverse transform; the ITransform wrapper below does one or two.
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+ const int16_t* in,
+ uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+ int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
+ const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
+
+ __asm__ volatile(
+ "lw %[temp20], 4(%[args]) \n\t"
+ VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
+ VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
+ VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
+ VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
+
+ HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
+ HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
+ HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+ HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne_MIPS32(ref, in, dst);
+ if (do_two) {
+ ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
+ }
+}
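+
+// Note: for do_two the second block sits 4 pixels to the right in ref/dst but
+// 16 coefficients further on in 'in'.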
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// macro for one pass through the for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N) \
+ "lh %[temp0], " #J "(%[ppin]) \n\t" \
+ "lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
+ "lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
+ "sra %[sign], %[temp0], 15 \n\t" \
+ "xor %[coeff], %[temp0], %[sign] \n\t" \
+ "subu %[coeff], %[coeff], %[sign] \n\t" \
+ "addu %[coeff], %[coeff], %[temp1] \n\t" \
+ "slt %[temp4], %[temp2], %[coeff] \n\t" \
+ "addiu %[temp5], $zero, 0 \n\t" \
+ "addiu %[level], $zero, 0 \n\t" \
+ "beqz %[temp4], 2f \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "lhu %[temp3], " #J "(%[ppq]) \n\t" \
+ "mul %[level], %[coeff], %[temp1] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu %[level], %[level], %[sign] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+"2: \n\t" \
+ "sh %[temp5], " #J "(%[ppin]) \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t"
+
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int sign, coeff, level, i;
+ int max_level = MAX_LEVEL;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile(
+ QUANTIZE_ONE( 0, 0, 0)
+ QUANTIZE_ONE( 2, 4, 2)
+ QUANTIZE_ONE( 8, 16, 4)
+ QUANTIZE_ONE(16, 32, 6)
+ QUANTIZE_ONE(10, 20, 8)
+ QUANTIZE_ONE( 4, 8, 10)
+ QUANTIZE_ONE( 6, 12, 12)
+ QUANTIZE_ONE(12, 24, 14)
+ QUANTIZE_ONE(18, 36, 16)
+ QUANTIZE_ONE(24, 48, 18)
+ QUANTIZE_ONE(26, 52, 20)
+ QUANTIZE_ONE(20, 40, 22)
+ QUANTIZE_ONE(14, 28, 24)
+ QUANTIZE_ONE(22, 44, 26)
+ QUANTIZE_ONE(28, 56, 28)
+ QUANTIZE_ONE(30, 60, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level)
+ : [pout]"r"(pout), [ppin]"r"(ppin),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+ // moved out of the macro so the scan can break out of the loop earlier
+ for (i = 15; i >= 0; i--) {
+ if (out[i]) return 1;
+ }
+ return 0;
+}
+
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#undef QUANTIZE_ONE
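+
+// The (J, K, N) offsets passed to QUANTIZE_ONE above follow the VP8 zigzag
+// scan and can be derived as (sketch):
+//   static const uint8_t kZigzag[16] = {
+//     0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+//   };
+//   J = kZigzag[n] * sizeof(int16_t);   // in/sharpen/q/iq offsets
+//   K = kZigzag[n] * sizeof(uint32_t);  // zthresh/bias offsets
+//   N = n * sizeof(int16_t);            // out offset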
+
+// macro for one horizontal pass in Disto4x4 (TTransform)
+// the two TTransform calls are merged into a single pass
+// A - offset in bytes to load from a and b buffers
+// E..H - offsets in bytes to store first results to tmp buffer
+// E1..H1 - offsets in bytes to store second results to tmp buffer
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
+ "lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp4], %[temp6] \n\t" \
+ "subu %[temp4], %[temp4], %[temp6] \n\t" \
+ "addu %[temp6], %[temp5], %[temp7] \n\t" \
+ "subu %[temp5], %[temp5], %[temp7] \n\t" \
+ "addu %[temp7], %[temp8], %[temp2] \n\t" \
+ "subu %[temp2], %[temp8], %[temp2] \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp3], %[temp6] \n\t" \
+ "subu %[temp3], %[temp3], %[temp6] \n\t" \
+ "addu %[temp6], %[temp4], %[temp5] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "sw %[temp7], " #E "(%[tmp]) \n\t" \
+ "sw %[temp2], " #H "(%[tmp]) \n\t" \
+ "sw %[temp8], " #F "(%[tmp]) \n\t" \
+ "sw %[temp0], " #G "(%[tmp]) \n\t" \
+ "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
+ "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
+ "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
+ "sw %[temp4], " #G1 "(%[tmp]) \n\t"
+
+// macro for one vertical pass in Disto4x4 (TTransform)
+// the two TTransform calls are merged into a single pass
+// since only one accumulator is available in the mips32r1 instruction set,
+// the second TTransform call is evaluated first and the first one
+// after it.
+// const int sum1 = TTransform(a, w);
+// const int sum2 = TTransform(b, w);
+// return abs(sum2 - sum1) >> 5;
+// (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
+// A..D - offsets in bytes to load first results from tmp buffer
+// A1..D1 - offsets in bytes to load second results from tmp buffer
+// E..H - offsets in bytes to load from w buffer
+#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
+ "lw %[temp0], " #A1 "(%[tmp]) \n\t" \
+ "lw %[temp1], " #C1 "(%[tmp]) \n\t" \
+ "lw %[temp2], " #B1 "(%[tmp]) \n\t" \
+ "lw %[temp3], " #D1 "(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp8], %[temp8], %[temp1] \n\t" \
+ "addu %[temp1], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp4], %[temp3], 31 \n\t" \
+ "sra %[temp5], %[temp1], 31 \n\t" \
+ "sra %[temp6], %[temp0], 31 \n\t" \
+ "sra %[temp7], %[temp8], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp4] \n\t" \
+ "xor %[temp1], %[temp1], %[temp5] \n\t" \
+ "xor %[temp0], %[temp0], %[temp6] \n\t" \
+ "xor %[temp8], %[temp8], %[temp7] \n\t" \
+ "subu %[temp3], %[temp3], %[temp4] \n\t" \
+ "subu %[temp1], %[temp1], %[temp5] \n\t" \
+ "subu %[temp0], %[temp0], %[temp6] \n\t" \
+ "subu %[temp8], %[temp8], %[temp7] \n\t" \
+ "lhu %[temp4], " #E "(%[w]) \n\t" \
+ "lhu %[temp5], " #F "(%[w]) \n\t" \
+ "lhu %[temp6], " #G "(%[w]) \n\t" \
+ "lhu %[temp7], " #H "(%[w]) \n\t" \
+ "madd %[temp4], %[temp3] \n\t" \
+ "madd %[temp5], %[temp1] \n\t" \
+ "madd %[temp6], %[temp0] \n\t" \
+ "madd %[temp7], %[temp8] \n\t" \
+ "lw %[temp0], " #A "(%[tmp]) \n\t" \
+ "lw %[temp1], " #C "(%[tmp]) \n\t" \
+ "lw %[temp2], " #B "(%[tmp]) \n\t" \
+ "lw %[temp3], " #D "(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp1], %[temp8], %[temp1] \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp2], %[temp3], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp2] \n\t" \
+ "subu %[temp3], %[temp3], %[temp2] \n\t" \
+ "msub %[temp4], %[temp3] \n\t" \
+ "sra %[temp2], %[temp8], 31 \n\t" \
+ "sra %[temp3], %[temp0], 31 \n\t" \
+ "sra %[temp4], %[temp1], 31 \n\t" \
+ "xor %[temp8], %[temp8], %[temp2] \n\t" \
+ "xor %[temp0], %[temp0], %[temp3] \n\t" \
+ "xor %[temp1], %[temp1], %[temp4] \n\t" \
+ "subu %[temp8], %[temp8], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp4] \n\t" \
+ "msub %[temp5], %[temp8] \n\t" \
+ "msub %[temp6], %[temp0] \n\t" \
+ "msub %[temp7], %[temp1] \n\t"
+
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int tmp[32];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ __asm__ volatile(
+ HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
+ HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
+ HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
+ HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
+ "mthi $zero \n\t"
+ "mtlo $zero \n\t"
+ VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
+ VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
+ VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
+ VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
+ "mflo %[temp0] \n\t"
+ "sra %[temp1], %[temp0], 31 \n\t"
+ "xor %[temp0], %[temp0], %[temp1] \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "sra %[temp0], %[temp0], 5 \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
+ : "memory", "hi", "lo"
+ );
+
+ return temp0;
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
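+
+// For reference, each merged TTransform pass amounts to a 4x4 Hadamard-style
+// transform followed by a weighted sum of magnitudes, i.e. roughly (sketch,
+// with hadamard4x4() standing in for the passes above):
+//   sum_a = 0; for (i = 0; i < 16; ++i) sum_a += w[i] * abs(hadamard4x4(a)[i]);
+//   sum_b = 0; for (i = 0; i < 16; ++i) sum_b += w[i] * abs(hadamard4x4(b)[i]);
+//   return abs(sum_b - sum_a) >> 5;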
+
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[temp20], %[temp16], %[temp17] \n\t" \
+ "lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
+ "lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
+ "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
+ "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
+ "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
+ "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
+ "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
+ "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
+ "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
+ "addiu %[temp16], %[temp16], 1812 \n\t" \
+ "addiu %[temp17], %[temp17], 937 \n\t" \
+ "addu %[temp16], %[temp16], %[temp19] \n\t" \
+ "subu %[temp17], %[temp17], %[temp18] \n\t" \
+ "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
+ "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
+ "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
+ "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
+ "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
+ "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
+ "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
+ "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
+ "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
+ "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
+ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
+
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile(
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A, B, C, D) \
+ "lbu %[temp0], " #A "(%[a]) \n\t" \
+ "lbu %[temp1], " #A "(%[b]) \n\t" \
+ "lbu %[temp2], " #B "(%[a]) \n\t" \
+ "lbu %[temp3], " #B "(%[b]) \n\t" \
+ "lbu %[temp4], " #C "(%[a]) \n\t" \
+ "lbu %[temp5], " #C "(%[b]) \n\t" \
+ "lbu %[temp6], " #D "(%[a]) \n\t" \
+ "lbu %[temp7], " #D "(%[b]) \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "subu %[temp6], %[temp6], %[temp7] \n\t" \
+ "madd %[temp0], %[temp0] \n\t" \
+ "madd %[temp2], %[temp2] \n\t" \
+ "madd %[temp4], %[temp4] \n\t" \
+ "madd %[temp6], %[temp6] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
+ GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
+ GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
+ GET_SSE_INNER(D, D + 1, D + 2, D + 3)
+
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
+ VP8ITransform = ITransform_MIPS32;
+ VP8FTransform = FTransform_MIPS32;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+ VP8TDisto4x4 = Disto4x4_MIPS32;
+ VP8TDisto16x16 = Disto16x16_MIPS32;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16_MIPS32;
+ VP8SSE8x8 = SSE8x8_MIPS32;
+ VP8SSE16x8 = SSE16x8_MIPS32;
+ VP8SSE4x4 = SSE4x4_MIPS32;
+#endif
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/enc_mips_dsp_r2.c b/media/libwebp/dsp/enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..cf4c85b59c
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips_dsp_r2.c
@@ -0,0 +1,1517 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Darko Laus (darko.laus@imgtec.com)
+// Mirko Raus (mirko.raus@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7, \
+ I0, I1, I2, I3, I4, I5, I6, I7) \
+ "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "addq.ph %[" #O2 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "subq.ph %[" #O3 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "addq.ph %[" #O4 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "subq.ph %[" #O5 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "addq.ph %[" #O6 "], %[" #I6 "], %[" #I7 "] \n\t" \
+ "subq.ph %[" #O7 "], %[" #I6 "], %[" #I7 "] \n\t"
+
+// IO - input/output
+#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7) \
+ "absq_s.ph %[" #IO0 "], %[" #IO0 "] \n\t" \
+ "absq_s.ph %[" #IO1 "], %[" #IO1 "] \n\t" \
+ "absq_s.ph %[" #IO2 "], %[" #IO2 "] \n\t" \
+ "absq_s.ph %[" #IO3 "], %[" #IO3 "] \n\t" \
+ "absq_s.ph %[" #IO4 "], %[" #IO4 "] \n\t" \
+ "absq_s.ph %[" #IO5 "], %[" #IO5 "] \n\t" \
+ "absq_s.ph %[" #IO6 "], %[" #IO6 "] \n\t" \
+ "absq_s.ph %[" #IO7 "], %[" #IO7 "] \n\t"
+
+// dpa.w.ph $ac0, temp0, temp1
+// $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
+// dpax.w.ph $ac0, temp0, temp1
+// $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
+// O - output
+// I - input (macro doesn't change it)
+#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7, \
+ I8, I9, I10, I11, I12, I13, I14, I15) \
+ "mult $ac0, $zero, $zero \n\t" \
+ "dpa.w.ph $ac0, %[" #I2 "], %[" #I0 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I5 "], %[" #I6 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I8 "], %[" #I9 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I11 "], %[" #I4 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I12 "], %[" #I7 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I13 "], %[" #I1 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I14 "], %[" #I3 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I15 "], %[" #I10 "] \n\t" \
+ "mflo %[" #O0 "], $ac0 \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_17() \
+ OUTPUT_EARLY_CLOBBER_REGS_10(), \
+ [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
+ [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
+ [temp17]"=&r"(temp17)
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP0 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP1 "], 4(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t" \
+ "lw %[" #TEMP3 "], " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "preceu.ph.qbl %[" #TEMP0 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbl %[" #TEMP1 "], %[" #TEMP3 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP2 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP3 "], %[" #TEMP3 "] \n\t" \
+ "subq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP3 "] \n\t" \
+ "rotr %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "addq.ph %[" #TEMP1 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "subq.ph %[" #TEMP3 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "seh %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "sra %[temp16], %[" #TEMP1 "], 16 \n\t" \
+ "seh %[temp19], %[" #TEMP3 "] \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 16 \n\t" \
+ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "addu %[" #TEMP0 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "mul %[temp17], %[temp19], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP3 "], %[c5352] \n\t" \
+ "mul %[" #TEMP1 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp16], %[" #TEMP3 "], %[c2217] \n\t" \
+ "sll %[" #TEMP2 "], %[" #TEMP2 "], 3 \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "subu %[" #TEMP3 "], %[temp17], %[temp18] \n\t" \
+ "addu %[" #TEMP1 "], %[temp16], %[" #TEMP1 "] \n\t" \
+ "addiu %[" #TEMP3 "], %[" #TEMP3 "], 937 \n\t" \
+ "addiu %[" #TEMP1 "], %[" #TEMP1 "], 1812 \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 9 \n\t" \
+ "sra %[" #TEMP1 "], %[" #TEMP1 "], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
+ "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
+ "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
+ "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
+ "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
+ "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
+ "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
+ "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
+ "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
+ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
+
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile (
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ OUTPUT_EARLY_CLOBBER_REGS_17()
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w)
+ : "memory", "hi", "lo"
+ );
+ return abs(temp3 - temp17) >> 5;
+}
+
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+ const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define FILL_PART(J, SIZE) \
+ "usw %[value], 0+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 4+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "usw %[value], 8+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 12+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".endif \n\t"
+
+#define FILL_8_OR_16(DST, VALUE, SIZE) do { \
+ int value = (VALUE); \
+ __asm__ volatile ( \
+ "replv.qb %[value], %[value] \n\t" \
+ FILL_PART( 0, SIZE) \
+ FILL_PART( 1, SIZE) \
+ FILL_PART( 2, SIZE) \
+ FILL_PART( 3, SIZE) \
+ FILL_PART( 4, SIZE) \
+ FILL_PART( 5, SIZE) \
+ FILL_PART( 6, SIZE) \
+ FILL_PART( 7, SIZE) \
+ ".if " #SIZE " == 16 \n\t" \
+ FILL_PART( 8, 16) \
+ FILL_PART( 9, 16) \
+ FILL_PART(10, 16) \
+ FILL_PART(11, 16) \
+ FILL_PART(12, 16) \
+ FILL_PART(13, 16) \
+ FILL_PART(14, 16) \
+ FILL_PART(15, 16) \
+ ".endif \n\t" \
+ : [value]"+&r"(value) \
+ : [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
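+
+// Scalar equivalent of FILL_8_OR_16 (sketch):
+//   for (j = 0; j < SIZE; ++j) memset((DST) + j * BPS, VALUE, SIZE);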
+
+#define VERTICAL_PRED(DST, TOP, SIZE) \
+static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (TOP)) { \
+ int j; \
+ if ((TOP)) { \
+ for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \
+ } else { \
+ FILL_8_OR_16((DST), 127, (SIZE)); \
+ } \
+}
+
+VERTICAL_PRED(dst, top, 8)
+VERTICAL_PRED(dst, top, 16)
+
+#undef VERTICAL_PRED
+
+#define HORIZONTAL_PRED(DST, LEFT, SIZE) \
+static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (LEFT)) { \
+ if (LEFT) { \
+ int j; \
+ for (j = 0; j < (SIZE); ++j) { \
+ memset((DST) + j * BPS, (LEFT)[j], (SIZE)); \
+ } \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+}
+
+HORIZONTAL_PRED(dst, left, 8)
+HORIZONTAL_PRED(dst, left, 16)
+
+#undef HORIZONTAL_PRED
+
+#define CLIPPING() \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+ "addu.ph %[temp2], %[temp2], %[leftY_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[leftY_1] \n\t" \
+ "addu.ph %[temp3], %[temp3], %[leftY_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[leftY_1] \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+
+#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "replv.ph %[leftY_1], %[leftY_1] \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[leftY_1], %[leftY_1], %[left_1] \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int y; \
+ const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
+#define TRUE_MOTION(DST, LEFT, TOP, SIZE) \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
+ const uint8_t* (TOP)) { \
+ if ((LEFT) != NULL) { \
+ if ((TOP) != NULL) { \
+ CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ } else { \
+ HorizontalPred##SIZE((DST), (LEFT)); \
+ } \
+ } else { \
+ /* true motion without left samples (hence: with default 129 value) */ \
+ /* is equivalent to VE prediction where you just copy the top samples. */ \
+ /* Note that if top samples are not available, the default value is */ \
+ /* then 129, and not 127 as in the VerticalPred case. */ \
+ if ((TOP) != NULL) { \
+ VerticalPred##SIZE((DST), (TOP)); \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+ } \
+}
+
+TRUE_MOTION(dst, left, top, 8)
+TRUE_MOTION(dst, left, top, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
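+
+// Per pixel, the TrueMotion kernel above computes (sketch):
+//   dst[x] = clip_8b(left[y] + top[x] - left[-1]);
+// the shll_s.ph (saturating shift) / precrqu_s.qb.ph pair implements the
+// clip to [0, 255].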
+
+static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC1], %[temp0], %[temp2] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 5 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 16);
+}
+
+static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[DC], %[temp0], %[temp1] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC1], %[temp2], %[temp3] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 4 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 8);
+}
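+
+// Both DCMode kernels reduce to (sketch, SIZE = 16 or 8):
+//   DC = sum(top[0..SIZE-1]) + sum(left[0..SIZE-1]);  // one side is doubled
+//                                                     // when the other is
+//                                                     // missing
+//   DC = (DC + SIZE) >> (SIZE == 16 ? 5 : 4);         // shra_r.w rounding
+//   if (top == NULL && left == NULL) DC = 0x80;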
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], -5(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "srl %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ "usw %[temp0], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
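+
+// DC4 averages the four top and four left samples; in the i4 boundary layout
+// the left samples sit at top[-5..-2], hence the "ulw ... -5(%[top])" load:
+//   dc = (sum(top[0..3]) + sum(top[-5..-2]) + 4) >> 3;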
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+ int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
+ const int c35 = 0xff00ff;
+ __asm__ volatile (
+ "lbu %[temp1], 0(%[top]) \n\t"
+ "lbu %[a10], 1(%[top]) \n\t"
+ "lbu %[temp2], 2(%[top]) \n\t"
+ "lbu %[a32], 3(%[top]) \n\t"
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "lbu %[temp4], -1(%[top]) \n\t"
+ "append %[a10], %[temp1], 16 \n\t"
+ "append %[a32], %[temp2], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "shrl.ph %[temp1], %[temp0], 8 \n\t"
+ "and %[temp0], %[temp0], %[c35] \n\t"
+ "subu.ph %[temp1], %[temp1], %[temp4] \n\t"
+ "subu.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "srl %[temp2], %[temp1], 16 \n\t"
+ "srl %[temp3], %[temp0], 16 \n\t"
+ "replv.ph %[temp2], %[temp2] \n\t"
+ "replv.ph %[temp3], %[temp3] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "replv.ph %[temp5], %[temp0] \n\t"
+ "addu.ph %[temp0], %[temp3], %[a10] \n\t"
+ "addu.ph %[temp1], %[temp3], %[a32] \n\t"
+ "addu.ph %[temp3], %[temp2], %[a10] \n\t"
+ "addu.ph %[temp2], %[temp2], %[a32] \n\t"
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp1], %[temp0] \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp5], %[a10] \n\t"
+ "addu.ph %[temp3], %[temp5], %[a32] \n\t"
+ "addu.ph %[temp5], %[temp4], %[a10] \n\t"
+ "addu.ph %[temp4], %[temp4], %[a32] \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp4], %[temp4], 7 \n\t"
+ "shll_s.ph %[temp5], %[temp5], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp3], %[temp2] \n\t"
+ "precrqu_s.qb.ph %[temp3], %[temp4], %[temp5] \n\t"
+ "usw %[temp1], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [a10]"=&r"(a10), [a32]"=&r"(a32)
+ : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ "usw %[temp4], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
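+
+// VE4 smooths the top row with the usual (1, 2, 1) filter and replicates the
+// result over all four rows (sketch):
+//   dst[x] = (top[x - 1] + 2 * top[x] + top[x + 1] + 2) >> 2;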
+
+static void HE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "lbu %[temp1], -5(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp4] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "replv.qb %[temp0], %[temp3] \n\t"
+ "replv.qb %[temp1], %[temp2] \n\t"
+ "srl %[temp3], %[temp3], 16 \n\t"
+ "srl %[temp2], %[temp2], 16 \n\t"
+ "replv.qb %[temp3], %[temp3] \n\t"
+ "replv.qb %[temp2], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp1], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "lbu %[temp0], 3(%[top]) \n\t"
+ "lbu %[temp1], 2(%[top]) \n\t"
+ "lbu %[temp2], 1(%[top]) \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "shra_r.w %[temp0], %[temp0], 2 \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "usw %[temp9], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp0], 8 \n\t"
+ "usw %[temp9], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "ulw %[temp1], 0(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbla %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbra %[temp1], %[temp1] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "move %[temp6], %[temp1] \n\t"
+ "append %[temp1], %[temp2], 16 \n\t"
+ "shll.ph %[temp9], %[temp6], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp6] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp3], %[temp7], %[temp3] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp7], %[temp2], %[temp0] \n\t"
+ "addu.ph %[temp6], %[temp0], %[temp2] \n\t"
+ "addu.ph %[temp3], %[temp3], %[temp9] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp8] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp4], %[temp5] \n\t"
+ "append %[temp4], %[temp5], 16 \n\t"
+ "precrq.ph.w %[temp2], %[temp3], %[temp1] \n\t"
+ "append %[temp3], %[temp1], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp4] \n\t"
+ "precr.qb.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp3], %[temp6], 8 \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "append %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ "usw %[temp9], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp1], 8 \n\t"
+ "usw %[temp9], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp5], %[temp4] \n\t"
+ "append %[temp5], %[temp4], 16 \n\t"
+ "precrq.ph.w %[temp3], %[temp2], %[temp0] \n\t"
+ "append %[temp2], %[temp0], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp5] \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "prepend %[temp3], %[temp6], 8 \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp5] \n\t"
+ "precrq.ph.w %[temp3], %[temp0], %[temp4] \n\t"
+ "precr.qb.ph %[temp7], %[temp6], %[temp1] \n\t"
+ "precr.qb.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "usw %[temp7], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp6], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp2], %[temp5], 16 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "precr.qb.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "precr.qb.ph %[temp4], %[temp2], %[temp0] \n\t"
+ "usw %[temp5], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "preceu.ph.qbl %[temp1], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "packrl.ph %[temp3], %[temp1], %[temp2] \n\t"
+ "replv.qb %[temp7], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "addqh_r.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "shll.ph %[temp6], %[temp3], 1 \n\t"
+ "addu.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp6] \n\t"
+ "shll.ph %[temp0], %[temp2], 1 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp2] \n\t"
+ "addu.ph %[temp0], %[temp3], %[temp0] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "packrl.ph %[temp3], %[temp6], %[temp5] \n\t"
+ "precrq.ph.w %[temp2], %[temp6], %[temp4] \n\t"
+ "append %[temp0], %[temp5], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "precr.qb.ph %[temp1], %[temp7], %[temp0] \n\t"
+ "usw %[temp7], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "packrl.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "usw %[temp1], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top) top += 8;
+ if (left) left += 16;
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16(I16DC16 + dst, left, top);
+ VerticalPred16(I16VE16 + dst, top);
+ HorizontalPred16(I16HE16 + dst, left);
+ TrueMotion16(I16TM16 + dst, left, top);
+}
+
+// The left samples are at top[-5..-2], the top-left sample is top[-1], the
+// top samples are at top[0..3], and the top-right samples are at top[4..7].
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A) \
+ "lw %[temp0], " #A "(%[a]) \n\t" \
+ "lw %[temp1], " #A "(%[b]) \n\t" \
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbl %[temp1], %[temp1] \n\t" \
+ "subq.ph %[temp2], %[temp2], %[temp3] \n\t" \
+ "subq.ph %[temp0], %[temp0], %[temp1] \n\t" \
+ "dpa.w.ph $ac0, %[temp2], %[temp2] \n\t" \
+ "dpa.w.ph $ac0, %[temp0], %[temp0] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A) \
+ GET_SSE_INNER(B) \
+ GET_SSE_INNER(C) \
+ GET_SSE_INNER(D)
+
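+// For reference, a scalar sketch of what the SSE kernels below compute
+// (illustrative only, not part of the build): 'mult $zero, $zero' clears the
+// accumulator, then each GET_SSE_INNER step loads four byte pairs, subtracts
+// them in parallel and accumulates the squared differences into $ac0 via the
+// two dpa.w.ph instructions.
+//
+//   int sse = 0;
+//   for (y = 0; y < h; ++y) {
+//     for (x = 0; x < w; ++x) {
+//       const int diff = a[x + y * BPS] - b[x + y * BPS];
+//       sse += diff * diff;
+//     }
+//   }
+//   return sse;   // with (w, h) = (16,16), (16,8), (8,8) or (4,4) below
+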
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // !WORK_AROUND_GCC
+
+#undef FILL_8_OR_16
+#undef FILL_PART
+#undef OUTPUT_EARLY_CLOBBER_REGS_17
+#undef MUL_HALF
+#undef ABS_X8
+#undef ADD_SUB_HALVES_X4
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Macro for one pass through the QuantizeBlock loop, reading two values at a
+// time, with the QUANTDIV macro inlined.
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+// N1 - offset in bytes ((n + 1) * 2)
+#define QUANTIZE_ONE(J, K, N, N1) \
+ "ulw %[temp1], " #J "(%[ppin]) \n\t" \
+ "ulw %[temp2], " #J "(%[ppsharpen]) \n\t" \
+ "lhu %[temp3], " #K "(%[ppzthresh]) \n\t" \
+ "lhu %[temp6], " #K "+4(%[ppzthresh]) \n\t" \
+ "absq_s.ph %[temp4], %[temp1] \n\t" \
+ "ins %[temp3], %[temp6], 16, 16 \n\t" \
+ "addu.ph %[coeff], %[temp4], %[temp2] \n\t" \
+ "shra.ph %[sign], %[temp1], 15 \n\t" \
+ "li %[level], 0x10001 \n\t" \
+ "cmp.lt.ph %[temp3], %[coeff] \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "pick.ph %[temp5], %[level], $0 \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "beqz %[temp5], 0f \n\t" \
+ "lhu %[temp3], " #J "(%[ppq]) \n\t" \
+ "beq %[temp5], %[level], 1f \n\t" \
+ "andi %[temp5], %[temp5], 0x1 \n\t" \
+ "andi %[temp4], %[coeff], 0xffff \n\t" \
+ "beqz %[temp5], 2f \n\t" \
+ "mul %[level], %[temp4], %[temp1] \n\t" \
+ "sh $0, " #J "+2(%[ppin]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "andi %[temp6], %[sign], 0xffff \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "sh %[temp5], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"2: \n\t" \
+ "lhu %[temp1], " #J "+2(%[ppiq]) \n\t" \
+ "srl %[temp5], %[coeff], 16 \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "lw %[temp2], " #K "+4(%[ppbias]) \n\t" \
+ "lhu %[temp3], " #J "+2(%[ppq]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "srl %[temp6], %[sign], 16 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "sh $0, " #J "(%[ppin]) \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[temp5], " #J "+2(%[ppin]) \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "j 3f \n\t" \
+"1: \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "ulw %[temp3], " #J "(%[ppq]) \n\t" \
+ "andi %[temp5], %[coeff], 0xffff \n\t" \
+ "srl %[temp0], %[coeff], 16 \n\t" \
+ "lhu %[temp6], " #J "+2(%[ppiq]) \n\t" \
+ "lw %[coeff], " #K "+4(%[ppbias]) \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "mul %[temp4], %[temp0], %[temp6] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "addu %[temp4], %[temp4], %[coeff] \n\t" \
+ "precrq.ph.w %[level], %[temp4], %[level] \n\t" \
+ "shra.ph %[level], %[level], 1 \n\t" \
+ "cmp.lt.ph %[max_level1],%[level] \n\t" \
+ "pick.ph %[level], %[max_level], %[level] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu.ph %[level], %[level], %[sign] \n\t" \
+ "mul.ph %[temp3], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "srl %[level], %[level], 16 \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "usw %[temp3], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"0: \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "usw $0, " #J "(%[ppin]) \n\t" \
+"3: \n\t"
+
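+// For reference, a scalar sketch of the per-coefficient logic that
+// QUANTIZE_ONE applies two coefficients at a time (this mirrors the portable
+// path; QFIX is 17, matching the 'sra ..., 17' above; j = kZigzag[n]):
+//
+//   const int sign = (in[j] < 0);
+//   const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+//   if (coeff > mtx->zthresh_[j]) {
+//     int level = (coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX;
+//     if (level > MAX_LEVEL) level = MAX_LEVEL;
+//     if (sign) level = -level;
+//     in[j] = level * mtx->q_[j];
+//     out[n] = level;
+//   } else {
+//     in[j] = out[n] = 0;
+//   }
+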
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int sign, coeff, level;
+ int max_level = MAX_LEVEL;
+ int max_level1 = max_level << 16 | max_level;
+ int ret = 0;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile (
+ QUANTIZE_ONE( 0, 0, 0, 2)
+ QUANTIZE_ONE( 4, 8, 10, 12)
+ QUANTIZE_ONE( 8, 16, 4, 8)
+ QUANTIZE_ONE(12, 24, 14, 24)
+ QUANTIZE_ONE(16, 32, 6, 16)
+ QUANTIZE_ONE(20, 40, 22, 26)
+ QUANTIZE_ONE(24, 48, 18, 20)
+ QUANTIZE_ONE(28, 56, 28, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
+ : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+ return (ret != 0);
+}
+
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#undef QUANTIZE_ONE
+
+// Macro for one horizontal pass in FTransformWHT.
+// temp0..temp7 hold tmp[0]..tmp[15]
+// A, B, C, D - offsets in bytes to load from the in buffer
+// TEMP0, TEMP1 - registers for the corresponding tmp elements
+#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1) \
+ "lh %[" #TEMP0 "], " #A "(%[in]) \n\t" \
+ "lh %[" #TEMP1 "], " #B "(%[in]) \n\t" \
+ "lh %[temp8], " #C "(%[in]) \n\t" \
+ "lh %[temp9], " #D "(%[in]) \n\t" \
+ "ins %[" #TEMP1 "], %[" #TEMP0 "], 16, 16 \n\t" \
+ "ins %[temp9], %[temp8], 16, 16 \n\t" \
+ "subq.ph %[temp8], %[" #TEMP1 "], %[temp9] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP1 "], %[temp9] \n\t" \
+ "precrq.ph.w %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "append %[temp8], %[temp9], 16 \n\t" \
+ "subq.ph %[" #TEMP1 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "addq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "rotr %[" #TEMP1 "], %[" #TEMP1 "], 16 \n\t"
+
+// Macro for one vertical pass in FTransformWHT.
+// temp0..temp7 hold tmp[0]..tmp[15]
+// A, B, C, D - offsets in bytes to store to the out buffer
+// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for the corresponding tmp elements
+#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6) \
+ "addq.ph %[temp8], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP6 "], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addqh.ph %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "subqh.ph %[" #TEMP4 "], %[" #TEMP6 "], %[" #TEMP2 "] \n\t" \
+ "addqh.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subqh.ph %[" #TEMP6 "], %[temp8], %[temp9] \n\t" \
+ "usw %[" #TEMP0 "], " #A "(%[out]) \n\t" \
+ "usw %[" #TEMP2 "], " #B "(%[out]) \n\t" \
+ "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
+ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
+
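+// Together, the four horizontal and two vertical passes below compute the
+// forward 4x4 Walsh-Hadamard transform of the sixteen DC values gathered
+// from in[] (columns 16 int16s apart, rows 64 int16s apart). As a scalar
+// sketch, each pass is the butterfly
+//
+//   a0 = x0 + x2;  a1 = x1 + x3;  a2 = x1 - x3;  a3 = x0 - x2;
+//   y0 = a0 + a1;  y1 = a3 + a2;  y2 = a3 - a2;  y3 = a0 - a1;
+//
+// with the vertical pass halving its results via the addqh.ph/subqh.ph
+// halving adds.
+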
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+
+ __asm__ volatile (
+ HORIZONTAL_PASS_WHT( 0, 32, 64, 96, temp0, temp1)
+ HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
+ HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
+ HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
+ VERTICAL_PASS_WHT(0, 8, 16, 24, temp0, temp2, temp4, temp6)
+ VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [in]"r"(in), [out]"r"(out)
+ : "memory"
+ );
+}
+
+#undef VERTICAL_PASS_WHT
+#undef HORIZONTAL_PASS_WHT
+
+// Macro for converting coefficients to histogram bins, eight coefficients at
+// a time.
+// A, B, C, D - offsets in bytes to load from the out buffer
+#define CONVERT_COEFFS_TO_BIN(A, B, C, D) \
+ "ulw %[temp0], " #A "(%[out]) \n\t" \
+ "ulw %[temp1], " #B "(%[out]) \n\t" \
+ "ulw %[temp2], " #C "(%[out]) \n\t" \
+ "ulw %[temp3], " #D "(%[out]) \n\t" \
+ "absq_s.ph %[temp0], %[temp0] \n\t" \
+ "absq_s.ph %[temp1], %[temp1] \n\t" \
+ "absq_s.ph %[temp2], %[temp2] \n\t" \
+ "absq_s.ph %[temp3], %[temp3] \n\t" \
+ "shra.ph %[temp0], %[temp0], 3 \n\t" \
+ "shra.ph %[temp1], %[temp1], 3 \n\t" \
+ "shra.ph %[temp2], %[temp2], 3 \n\t" \
+ "shra.ph %[temp3], %[temp3], 3 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 10 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 10 \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 10 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 10 \n\t" \
+ "shrl.ph %[temp0], %[temp0], 10 \n\t" \
+ "shrl.ph %[temp1], %[temp1], 10 \n\t" \
+ "shrl.ph %[temp2], %[temp2], 10 \n\t" \
+ "shrl.ph %[temp3], %[temp3], 10 \n\t" \
+ "shll.ph %[temp0], %[temp0], 2 \n\t" \
+ "shll.ph %[temp1], %[temp1], 2 \n\t" \
+ "shll.ph %[temp2], %[temp2], 2 \n\t" \
+ "shll.ph %[temp3], %[temp3], 2 \n\t" \
+ "ext %[temp4], %[temp0], 0, 16 \n\t" \
+ "ext %[temp0], %[temp0], 16, 16 \n\t" \
+ "addu %[temp4], %[temp4], %[dist] \n\t" \
+ "addu %[temp0], %[temp0], %[dist] \n\t" \
+ "ext %[temp5], %[temp1], 0, 16 \n\t" \
+ "lw %[temp8], 0(%[temp4]) \n\t" \
+ "ext %[temp1], %[temp1], 16, 16 \n\t" \
+ "addu %[temp5], %[temp5], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp4]) \n\t" \
+ "lw %[temp8], 0(%[temp0]) \n\t" \
+ "addu %[temp1], %[temp1], %[dist] \n\t" \
+ "ext %[temp6], %[temp2], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp0]) \n\t" \
+ "lw %[temp8], 0(%[temp5]) \n\t" \
+ "ext %[temp2], %[temp2], 16, 16 \n\t" \
+ "addu %[temp6], %[temp6], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp5]) \n\t" \
+ "lw %[temp8], 0(%[temp1]) \n\t" \
+ "addu %[temp2], %[temp2], %[dist] \n\t" \
+ "ext %[temp7], %[temp3], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp1]) \n\t" \
+ "lw %[temp8], 0(%[temp6]) \n\t" \
+ "ext %[temp3], %[temp3], 16, 16 \n\t" \
+ "addu %[temp7], %[temp7], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp6]) \n\t" \
+ "lw %[temp8], 0(%[temp2]) \n\t" \
+ "addu %[temp3], %[temp3], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp2]) \n\t" \
+ "lw %[temp8], 0(%[temp7]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp7]) \n\t" \
+ "lw %[temp8], 0(%[temp3]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp3]) \n\t"
+
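+// In scalar terms, each step above is roughly (a sketch; the shll_s.ph /
+// shrl.ph pair saturates the bin to MAX_COEFF_THRESH):
+//
+//   int v = abs(out[k]) >> 3;
+//   if (v > MAX_COEFF_THRESH) v = MAX_COEFF_THRESH;
+//   ++distribution[v];   // 'shll.ph ..., 2' scales v to a byte offset
+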
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin.
+ __asm__ volatile (
+ CONVERT_COEFFS_TO_BIN( 0, 4, 8, 12)
+ CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
+ : "memory"
+ );
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+#undef CONVERT_COEFFS_TO_BIN
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
+ VP8FTransform = FTransform_MIPSdspR2;
+ VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+ VP8ITransform = ITransform_MIPSdspR2;
+
+ VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+ VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+ VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+ VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+ VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16_MIPSdspR2;
+ VP8SSE8x8 = SSE8x8_MIPSdspR2;
+ VP8SSE16x8 = SSE16x8_MIPSdspR2;
+ VP8SSE4x4 = SSE4x4_MIPSdspR2;
+#endif
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+ VP8CollectHistogram = CollectHistogram_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/enc_msa.c b/media/libwebp/dsp/enc_msa.c
new file mode 100644
index 0000000000..229582e4a6
--- /dev/null
+++ b/media/libwebp/dsp/enc_msa.c
@@ -0,0 +1,896 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of encoder dsp functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <stdlib.h>
+#include "../dsp/msa_macro.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
+ const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
+ v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
+ v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
+ v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
+ v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
+ \
+ ADDSUB2(in0, in2, a1_m, b1_m); \
+ SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
+ c_tmp2_m = c_tmp2_m + in3; \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
+ d_tmp1_m = d_tmp1_m + in1; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+} while (0)
+
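+// For reference, a scalar sketch of one IDCT_1D_W pass (the constants match
+// the portable inverse transform: 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and
+// 35468/65536 ~= sqrt(2)*sin(pi/8)):
+//
+//   a = in0 + in2;  b = in0 - in2;
+//   c = ((in1 * 35468) >> 16) - (((in3 * 20091) >> 16) + in3);
+//   d = (((in1 * 20091) >> 16) + in1) + ((in3 * 35468) >> 16);
+//   out0 = a + d;  out1 = b + c;  out2 = b - c;  out3 = a - d;
+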
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ v16i8 dest0, dest1, dest2, dest3;
+ const v16i8 zero = { 0 };
+
+ LD_SH2(in, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
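+// FTransform_MSA below follows the portable forward transform. One detail
+// worth noting: the __msa_ceqi_w / nor / and / fill(1) sequence near the end
+// appears to implement the '+ (a3 != 0)' rounding correction the portable
+// version applies to the second output row, roughly
+//
+//   out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+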
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ uint64_t out0, out1, out2, out3;
+ uint32_t in0, in1, in2, in3;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v8i16 t0, t1, t2, t3;
+ v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
+ const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+ const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
+ const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+ const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
+ const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
+
+ LW4(src, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src0);
+ LW4(ref, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src1);
+ ILVRL_B2_UB(src0, src1, srcl0, srcl1);
+ HSUB_UB2_SH(srcl0, srcl1, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ t0 = SRLI_H(t0, 3);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ FILL_W2_SW(1812, 937, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 9);
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
+ SRAI_W2_SW(tmp0, tmp2, 4);
+ FILL_W2_SW(12000, 51000, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 16);
+ UNPCK_R_SH_SW(t1, tmp4);
+ tmp5 = __msa_ceqi_w(tmp4, 0);
+ tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
+ tmp5 = __msa_fill_w(1);
+ tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
+ tmp1 += tmp5;
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, out, 8);
+}
+
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+ v8i16 in0 = { 0 };
+ v8i16 in1 = { 0 };
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ in0 = __msa_insert_h(in0, 0, in[ 0]);
+ in0 = __msa_insert_h(in0, 1, in[ 64]);
+ in0 = __msa_insert_h(in0, 2, in[128]);
+ in0 = __msa_insert_h(in0, 3, in[192]);
+ in0 = __msa_insert_h(in0, 4, in[ 16]);
+ in0 = __msa_insert_h(in0, 5, in[ 80]);
+ in0 = __msa_insert_h(in0, 6, in[144]);
+ in0 = __msa_insert_h(in0, 7, in[208]);
+ in1 = __msa_insert_h(in1, 0, in[ 48]);
+ in1 = __msa_insert_h(in1, 1, in[112]);
+ in1 = __msa_insert_h(in1, 2, in[176]);
+ in1 = __msa_insert_h(in1, 3, in[240]);
+ in1 = __msa_insert_h(in1, 4, in[ 32]);
+ in1 = __msa_insert_h(in1, 5, in[ 96]);
+ in1 = __msa_insert_h(in1, 6, in[160]);
+ in1 = __msa_insert_h(in1, 7, in[224]);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, out0, out1);
+ SRAI_H2_SH(out0, out1, 1);
+ ST_SH2(out0, out1, out, 8);
+}
+
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+ int sum;
+ uint32_t in0_m, in1_m, in2_m, in3_m;
+ v16i8 src0 = { 0 };
+ v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
+ v4i32 dst0, dst1;
+ const v16i8 zero = { 0 };
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
+ INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
+ ILVRL_B2_SH(zero, src0, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
+ tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
+ LD_SH2(w, 8, tmp2, tmp3);
+ DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+ dst0 = dst0 + dst1;
+ sum = HADD_SW_S32(dst0);
+ return sum;
+}
+
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform_MSA(a, w);
+ const int sum2 = TTransform_MSA(b, w);
+ return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MSA(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Histogram
+
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ v8i16 coeff0, coeff1;
+ const v8i16 zero = { 0 };
+ const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
+ LD_SH2(&out[0], 8, coeff0, coeff1);
+ coeff0 = __msa_add_a_h(coeff0, zero);
+ coeff1 = __msa_add_a_h(coeff1, zero);
+ SRAI_H2_SH(coeff0, coeff1, 3);
+ coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
+ coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
+ ST_SH2(coeff0, coeff1, &out[0], 8);
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
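+// The vector predictors below (VE4/RD4/LD4) rely on the exact identity
+//
+//   AVG3(a, b, c) == AVG2(((a) + (c)) >> 1, b)
+//
+// so a truncating byte average (__msa_ave_u_b) of A and C followed by a
+// rounding byte average (__msa_aver_u_b) with B computes AVG3 lane by lane
+// without widening.
+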
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const v16u8 A1 = { 0 };
+ const uint64_t val_m = LD(top - 1);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R = __msa_aver_u_b(AC, B2);
+ const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A2 = { 0 };
+ const uint64_t val_m = LD(top - 5);
+ const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
+ const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val3, val2, val1, val0, dst, BPS);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A1 = { 0 };
+ const uint64_t val_m = LD(top);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C1 = SLDI_UB(A, A, 2);
+ const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 2) = AVG3(E, F, G);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
+ const v16u8 T1 = LD_UB(top);
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ v8i16 r0, r1, r2, r3;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+// luma 16x16 prediction
+
+#define STORE16x16(out, dst) do { \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const v16u8 out = LD_UB(top);
+ STORE16x16(out, dst);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x7f);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+ const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 16; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T = LD_UB(top);
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred16x16(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred16x16(dst, top);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC;
+ v16u8 out;
+ if (top != NULL && left != NULL) {
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ const v8u16 dctemp = dctop + dcleft;
+ DC = HADD_UH_U32(dctemp);
+ DC = (DC + 16) >> 5;
+ } else if (left != NULL) { // left but no top
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ DC = HADD_UH_U32(dcleft);
+ DC = (DC + DC + 16) >> 5;
+ } else if (top != NULL) { // top but no left
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ DC = HADD_UH_U32(dctop);
+ DC = (DC + DC + 16) >> 5;
+ } else { // no top, no left, nothing.
+ DC = 0x80;
+ }
+ out = (v16u8)__msa_fill_b(DC);
+ STORE16x16(out, dst);
+}
+
+static void Intra16Preds_MSA(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16x16(I16DC16 + dst, left, top);
+ VerticalPred16x16(I16VE16 + dst, top);
+ HorizontalPred16x16(I16HE16 + dst, left);
+ TrueMotion16x16(I16TM16 + dst, left, top);
+}
+
+// Chroma 8x8 prediction
+
+#define CALC_DC8(in, out) do { \
+ const v8u16 temp0 = __msa_hadd_u_h(in, in); \
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
+ const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
+ const v2i64 temp3 = __msa_splati_d(temp2, 1); \
+ const v2i64 temp4 = temp3 + temp2; \
+ const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
+ const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
+ out = __msa_copy_s_d(temp6, 0); \
+} while (0)
+
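+// In scalar terms, CALC_DC8 computes the rounded average of the sixteen bytes
+// in 'in' and broadcasts it to all eight output bytes (a sketch):
+//
+//   uint32_t sum = 0;
+//   for (i = 0; i < 16; ++i) sum += in[i];
+//   dc = (sum + 8) >> 4;                  // __msa_srari_d(temp4, 4)
+//   out = dc * 0x0101010101010101ULL;
+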
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const uint64_t out = LD(top);
+ STORE8x8(out, dst);
+ } else {
+ const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T1 = LD_UB(top);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ for (j = 0; j < 8; j += 4) {
+ uint64_t out0, out1, out2, out3;
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
+ v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
+ v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
+ v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred8x8(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred8x8(dst, top);
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ uint64_t out;
+ v16u8 src = { 0 };
+ if (top != NULL && left != NULL) {
+ const uint64_t left_m = LD(left);
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(left_m, top_m, src);
+ CALC_DC8(src, out);
+ } else if (left != NULL) { // left but no top
+ const uint64_t left_m = LD(left);
+ INSERT_D2_UB(left_m, left_m, src);
+ CALC_DC8(src, out);
+ } else if (top != NULL) { // top but no left
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(top_m, top_m, src);
+ CALC_DC8(src, out);
+ } else { // no top, no left, nothing.
+ src = (v16u8)__msa_fill_b(0x80);
+ out = __msa_copy_s_d((v2i64)src, 0);
+ }
+ STORE8x8(out, dst);
+}
+
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ a += 8 * BPS;
+ b += 8 * BPS;
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 t0, t1, t2, t3;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
+ PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
+ PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum = 0;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
+ v8i16 diff0, diff1;
+ v4i32 out0, out1;
+
+ LW4(a, BPS, src0, src1, src2, src3);
+ LW4(b, BPS, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVRL_B2_UB(src, ref, tmp0, tmp1);
+ HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
+ DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
+ out0 += out1;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int sum;
+ v8i16 in0, in1, sh0, sh1, out0, out1;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
+ v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
+ const v8i16 zero = { 0 };
+ const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+ const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+ const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
+
+ LD_SH2(&in[0], 8, in0, in1);
+ LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
+ tmp4 = __msa_add_a_h(in0, zero);
+ tmp5 = __msa_add_a_h(in1, zero);
+ ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
+ ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
+ HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
+ sign0 = (in0 < zero);
+ sign1 = (in1 < zero); // sign
+ LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq
+ ILVRL_H2_SW(zero, tmp0, t0, t1);
+ ILVRL_H2_SW(zero, tmp1, t2, t3);
+ LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias
+ MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
+ ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
+ SRAI_W4_SW(b0, b1, b2, b3, 17);
+ PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
+ tmp0 = (tmp2 > maxlevel);
+ tmp1 = (tmp3 > maxlevel);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
+ SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
+ LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
+ t0 = (s0 > t0);
+ t1 = (s1 > t1);
+ t2 = (s2 > t2);
+ t3 = (s3 > t3);
+ PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
+ tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
+ LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
+ MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
+ VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
+ ST_SH2(in0, in1, &in[0], 8);
+ ST_SH2(out0, out1, &out[0], 8);
+ out0 = __msa_add_a_h(out0, out1);
+ sum = HADD_SH_S32(out0);
+ return (sum > 0);
+}
+
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
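+
+// Note: the returned 'nz' is a two-bit non-zero mask: bit 0 is set when the
+// first 4x4 block has any non-zero quantized coefficient, bit 1 when the
+// second one does (e.g. nz == 3 means both blocks carry non-zero levels).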
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
+ VP8ITransform = ITransform_MSA;
+ VP8FTransform = FTransform_MSA;
+ VP8FTransformWHT = FTransformWHT_MSA;
+
+ VP8TDisto4x4 = Disto4x4_MSA;
+ VP8TDisto16x16 = Disto16x16_MSA;
+ VP8CollectHistogram = CollectHistogram_MSA;
+
+ VP8EncPredLuma4 = Intra4Preds_MSA;
+ VP8EncPredLuma16 = Intra16Preds_MSA;
+ VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+ VP8SSE16x16 = SSE16x16_MSA;
+ VP8SSE16x8 = SSE16x8_MSA;
+ VP8SSE8x8 = SSE8x8_MSA;
+ VP8SSE4x4 = SSE4x4_MSA;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MSA;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/enc_neon.c b/media/libwebp/dsp/enc_neon.c
new file mode 100644
index 0000000000..657be9b21b
--- /dev/null
+++ b/media/libwebp/dsp/enc_neon.c
@@ -0,0 +1,938 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (https://www.webmproject.org/code/)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+
+#include "../dsp/neon.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOne in dec_neon.c, except
+// for the subtraction to *ref. See the comments there for algorithmic
+// explanations.
+
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734;  // half of the true kC2 (= 35468); see the comment above.
+
+// This code works but is *slower* than the inlined-asm version below
+// (with gcc-4.6), so it is only used when the WEBP_USE_INTRINSICS define
+// is set. With gcc-4.8, it's a little faster than the inlined assembly.
+#if defined(WEBP_USE_INTRINSICS)
+
+// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
+ // Unsigned saturate to 8b.
+ const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+ const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+ // Store the results.
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+ const int16x8_t row23,
+ const uint8_t* const ref,
+ uint8_t* const dst) {
+ uint32x2_t dst01 = vdup_n_u32(0);
+ uint32x2_t dst23 = vdup_n_u32(0);
+
+ // Load the source pixels.
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
+
+ {
+ // Convert to 16b.
+ const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+ const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
+
+ // Descale with rounding.
+ const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+ const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+ // Add the inverse transform.
+ SaturateAndStore4x4_NEON(dst, out01, out23);
+ }
+}
+
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+ const int16x8_t in1,
+ int16x8x2_t* const out) {
+ // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
+ // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
+ const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
+ // b0 d0 b1 d1 b2 d2 ...
+ *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
+ // {rows} = in0 | in4
+ // in8 | in12
+ // B1 = in4 | in12
+ const int16x8_t B1 =
+ vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+ // C0 = kC1 * in4 | kC1 * in12
+ // C1 = kC2 * in4 | kC2 * in12
+ const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+ const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
+ const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 + in8
+ const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 - in8
+ // c = kC2 * in4 - kC1 * in12
+ // d = kC1 * in4 + kC2 * in12
+ const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+ const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+ const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
+ const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
+ const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
+ const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
+ const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+ Transpose8x2_NEON(E0, E1, rows);
+}
+
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
+ int16x8x2_t rows;
+ INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+ TransformPass_NEON(&rows);
+ TransformPass_NEON(&rows);
+ Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
+}
+
+#else
+
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
+ const int kBPS = BPS;
+ const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
+
+ __asm__ volatile (
+ "vld1.16 {q1, q2}, [%[in]] \n"
+ "vld1.16 {d0}, [%[kC1C2]] \n"
+
+ // d2: in[0]
+ // d3: in[8]
+ // d4: in[4]
+ // d5: in[12]
+ "vswp d3, d4 \n"
+
+ // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+ // q9 = {in[4], in[12]} * kC2 >> 16
+ "vqdmulh.s16 q8, q2, d0[0] \n"
+ "vqdmulh.s16 q9, q2, d0[1] \n"
+
+ // d22 = a = in[0] + in[8]
+ // d23 = b = in[0] - in[8]
+ "vqadd.s16 d22, d2, d3 \n"
+ "vqsub.s16 d23, d2, d3 \n"
+
+ // q8 = in[4]/[12] * kC1 >> 16
+ "vshr.s16 q8, q8, #1 \n"
+
+ // Add {in[4], in[12]} back after the multiplication.
+ "vqadd.s16 q8, q2, q8 \n"
+
+ // d20 = c = in[4]*kC2 - in[12]*kC1
+ // d21 = d = in[4]*kC1 + in[12]*kC2
+ "vqsub.s16 d20, d18, d17 \n"
+ "vqadd.s16 d21, d19, d16 \n"
+
+ // d2 = tmp[0] = a + d
+ // d3 = tmp[1] = b + c
+ // d4 = tmp[2] = b - c
+ // d5 = tmp[3] = a - d
+ "vqadd.s16 d2, d22, d21 \n"
+ "vqadd.s16 d3, d23, d20 \n"
+ "vqsub.s16 d4, d23, d20 \n"
+ "vqsub.s16 d5, d22, d21 \n"
+
+ "vzip.16 q1, q2 \n"
+ "vzip.16 q1, q2 \n"
+
+ "vswp d3, d4 \n"
+
+ // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+ // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+ "vqdmulh.s16 q8, q2, d0[0] \n"
+ "vqdmulh.s16 q9, q2, d0[1] \n"
+
+ // d22 = a = tmp[0] + tmp[8]
+ // d23 = b = tmp[0] - tmp[8]
+ "vqadd.s16 d22, d2, d3 \n"
+ "vqsub.s16 d23, d2, d3 \n"
+
+ "vshr.s16 q8, q8, #1 \n"
+ "vqadd.s16 q8, q2, q8 \n"
+
+    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
+    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
+ "vqsub.s16 d20, d18, d17 \n"
+ "vqadd.s16 d21, d19, d16 \n"
+
+ // d2 = tmp[0] = a + d
+ // d3 = tmp[1] = b + c
+ // d4 = tmp[2] = b - c
+ // d5 = tmp[3] = a - d
+ "vqadd.s16 d2, d22, d21 \n"
+ "vqadd.s16 d3, d23, d20 \n"
+ "vqsub.s16 d4, d23, d20 \n"
+ "vqsub.s16 d5, d22, d21 \n"
+
+ "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
+ "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
+ "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
+ "vld1.32 d7[1], [%[ref]], %[kBPS] \n"
+
+ "sub %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // descale: (val + 4) >> 3
+ "vrshr.s16 d2, d2, #3 \n"
+ "vrshr.s16 d3, d3, #3 \n"
+ "vrshr.s16 d4, d4, #3 \n"
+ "vrshr.s16 d5, d5, #3 \n"
+
+ "vzip.16 q1, q2 \n"
+ "vzip.16 q1, q2 \n"
+
+ // Must accumulate before saturating
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+
+ "vqadd.s16 q1, q1, q8 \n"
+ "vqadd.s16 q2, q2, q9 \n"
+
+ "vqmovun.s16 d0, q1 \n"
+ "vqmovun.s16 d1, q2 \n"
+
+ "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[1], [%[dst]] \n"
+
+ : [in] "+r"(in), [dst] "+r"(dst) // modified registers
+ : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
+ : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
+ );
+}
+
+#endif // WEBP_USE_INTRINSICS
+
+static void ITransform_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst, int do_two) {
+ ITransformOne_NEON(ref, in, dst);
+ if (do_two) {
+ ITransformOne_NEON(ref + 4, in + 16, dst + 4);
+ }
+}
+
+// Load all 4x4 pixels into a single uint8x16_t variable.
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
+ uint32x4_t out = vdupq_n_u32(0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
+ out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
+ out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
+ return vreinterpretq_u8_u32(out);
+}
+
+// Forward transform.
+
+#if defined(WEBP_USE_INTRINSICS)
+
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+ const int16x4_t B,
+ const int16x4_t C,
+ const int16x4_t D,
+ int16x8_t* const out01,
+ int16x8_t* const out32) {
+ const int16x4x2_t AB = vtrn_s16(A, B);
+ const int16x4x2_t CD = vtrn_s16(C, D);
+ const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
+ vreinterpret_s32_s16(CD.val[0]));
+ const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
+ vreinterpret_s32_s16(CD.val[1]));
+ *out01 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
+ vreinterpret_s64_s32(tmp13.val[0])));
+ *out32 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
+ vreinterpret_s64_s32(tmp02.val[1])));
+}
+
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+ const uint8x8_t b) {
+ return vreinterpretq_s16_u16(vsubl_u8(a, b));
+}
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ int16x8_t d0d1, d3d2; // working 4x4 int16 variables
+ {
+ const uint8x16_t S0 = Load4x4_NEON(src);
+ const uint8x16_t R0 = Load4x4_NEON(ref);
+ const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+ const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
+ const int16x4_t D0 = vget_low_s16(D0D1);
+ const int16x4_t D1 = vget_high_s16(D0D1);
+ const int16x4_t D2 = vget_low_s16(D2D3);
+ const int16x4_t D3 = vget_high_s16(D2D3);
+ Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
+ }
+  { // 1st pass
+ const int32x4_t kCst937 = vdupq_n_s32(937);
+ const int32x4_t kCst1812 = vdupq_n_s32(1812);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
+ const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
+ const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
+ Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+ }
+ { // 2nd pass
+    // the (1<<16) addition implements the '+1' in: (a3 != 0) == 1 - (a3 == 0)
+ const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
+ const int32x4_t kCst51000 = vdupq_n_s32(51000);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
+ const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
+ const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
+ const int16x4_t a3_eq_0 =
+ vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
+ const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
+}
+
+#else
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+static const int16_t kCoeff16[] = {
+ 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
+};
+static const int32_t kCoeff32[] = {
+ 1812, 1812, 1812, 1812,
+ 937, 937, 937, 937,
+ 12000, 12000, 12000, 12000,
+ 51000, 51000, 51000, 51000
+};
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const int kBPS = BPS;
+ const uint8_t* src_ptr = src;
+ const uint8_t* ref_ptr = ref;
+ const int16_t* coeff16 = kCoeff16;
+ const int32_t* coeff32 = kCoeff32;
+
+ __asm__ volatile (
+ // load src into q4, q5 in high half
+ "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d11}, [%[src_ptr]] \n"
+
+ // load ref into q6, q7 in high half
+ "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d15}, [%[ref_ptr]] \n"
+
+    // Pack the high values into q4 and q6
+ "vtrn.32 q4, q5 \n"
+ "vtrn.32 q6, q7 \n"
+
+ // d[0-3] = src - ref
+ "vsubl.u8 q0, d8, d12 \n"
+ "vsubl.u8 q1, d9, d13 \n"
+
+ // load coeff16 into q8(d16=5352, d17=2217)
+ "vld1.16 {q8}, [%[coeff16]] \n"
+
+ // load coeff32 high half into q9 = 1812, q10 = 937
+ "vld1.32 {q9, q10}, [%[coeff32]]! \n"
+
+ // load coeff32 low half into q11=12000, q12=51000
+ "vld1.32 {q11,q12}, [%[coeff32]] \n"
+
+ // part 1
+ // Transpose. Register dN is the same as dN in C
+ "vtrn.32 d0, d2 \n"
+ "vtrn.32 d1, d3 \n"
+ "vtrn.16 d0, d1 \n"
+ "vtrn.16 d2, d3 \n"
+
+ "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
+ "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
+ "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
+ "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
+
+ "vadd.s16 d0, d4, d5 \n" // a0 + a1
+ "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
+ "vsub.s16 d2, d4, d5 \n" // a0 - a1
+    "vshl.s16        d2, d2, #3            \n"   // temp[2+i*4] = (a0-a1) << 3
+
+ "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
+ "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
+ "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
+ "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
+
+ // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
+ // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
+ "vshrn.s32 d1, q9, #9 \n"
+ "vshrn.s32 d3, q10, #9 \n"
+
+ // part 2
+ // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ "vtrn.32 d0, d2 \n"
+ "vtrn.32 d1, d3 \n"
+ "vtrn.16 d0, d1 \n"
+ "vtrn.16 d2, d3 \n"
+
+ "vmov.s16 d26, #7 \n"
+
+ "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
+ "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
+ "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
+ "vadd.s16 d4, d4, d26 \n" // a1 + 7
+ "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
+
+ "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
+ "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
+
+ "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
+ "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
+
+ "vceq.s16 d4, d7, #0 \n"
+
+ "vshr.s16 d0, d0, #4 \n"
+ "vshr.s16 d2, d2, #4 \n"
+
+ "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
+ "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
+
+ "vmvn d4, d4 \n" // !(d1 == 0)
+ // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ "vshrn.s32 d1, q11, #16 \n"
+ // op[4] += (d1!=0)
+ "vsub.s16 d1, d1, d4 \n"
+ // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+ "vshrn.s32 d3, q12, #16 \n"
+
+ // set result to out array
+ "vst1.16 {q0, q1}, [%[out]] \n"
+ : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+ [coeff32] "+r"(coeff32) // modified registers
+ : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+ [out] "r"(out) // constants
+ : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13" // clobbered
+ );
+}
+
+#endif
+
+#define LOAD_LANE_16b(VALUE, LANE) do { \
+ (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
+ src += stride; \
+} while (0)
+
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+ const int stride = 16;
+ const int16x4_t zero = vdup_n_s16(0);
+ int32x4x4_t tmp0;
+ int16x4x4_t in;
+ INIT_VECTOR4(in, zero, zero, zero, zero);
+ LOAD_LANE_16b(in.val[0], 0);
+ LOAD_LANE_16b(in.val[1], 0);
+ LOAD_LANE_16b(in.val[2], 0);
+ LOAD_LANE_16b(in.val[3], 0);
+ LOAD_LANE_16b(in.val[0], 1);
+ LOAD_LANE_16b(in.val[1], 1);
+ LOAD_LANE_16b(in.val[2], 1);
+ LOAD_LANE_16b(in.val[3], 1);
+ LOAD_LANE_16b(in.val[0], 2);
+ LOAD_LANE_16b(in.val[1], 2);
+ LOAD_LANE_16b(in.val[2], 2);
+ LOAD_LANE_16b(in.val[3], 2);
+ LOAD_LANE_16b(in.val[0], 3);
+ LOAD_LANE_16b(in.val[1], 3);
+ LOAD_LANE_16b(in.val[2], 3);
+ LOAD_LANE_16b(in.val[3], 3);
+
+ {
+ // a0 = in[0 * 16] + in[2 * 16]
+ // a1 = in[1 * 16] + in[3 * 16]
+ // a2 = in[1 * 16] - in[3 * 16]
+ // a3 = in[0 * 16] - in[2 * 16]
+ const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
+ const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
+ const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
+ const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
+ tmp0.val[0] = vaddq_s32(a0, a1);
+ tmp0.val[1] = vaddq_s32(a3, a2);
+ tmp0.val[2] = vsubq_s32(a3, a2);
+ tmp0.val[3] = vsubq_s32(a0, a1);
+ }
+ {
+ const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
+ // a0 = tmp[0 + i] + tmp[ 8 + i]
+ // a1 = tmp[4 + i] + tmp[12 + i]
+ // a2 = tmp[4 + i] - tmp[12 + i]
+ // a3 = tmp[0 + i] - tmp[ 8 + i]
+ const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
+ const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
+ const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
+ const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
+ const int16x4_t out0 = vmovn_s32(b0);
+ const int16x4_t out1 = vmovn_s32(b1);
+ const int16x4_t out2 = vmovn_s32(b2);
+ const int16x4_t out3 = vmovn_s32(b3);
+
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
+}
+#undef LOAD_LANE_16b
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
+ const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+ const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+ const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+ vreinterpretq_s32_s16(q2_tmp1.val[0]));
+ const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+ vreinterpretq_s32_s16(q2_tmp1.val[1]));
+ q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+ q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+ q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+ q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+ return q4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+ const int16x8x4_t q4_in) {
+ // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+ // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+ const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+ const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+ const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+ const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+ int16x8x4_t q4_out;
+ // tmp[0] = a0 + a1
+ // tmp[1] = a3 + a2
+ // tmp[2] = a3 - a2
+ // tmp[3] = a0 - a1
+ INIT_VECTOR4(q4_out,
+ vabsq_s16(vaddq_s16(q_a0, q_a1)),
+ vabsq_s16(vaddq_s16(q_a3, q_a2)),
+ vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
+ return q4_out;
+}
+
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
+ const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
+ q4_in.val[2]));
+ const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
+ q4_in.val[3]));
+ const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
+ q4_in.val[3]));
+ const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
+ q4_in.val[2]));
+ int16x8x4_t q4_out;
+
+ INIT_VECTOR4(q4_out,
+ vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+ vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+ return q4_out;
+}
+
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
+ const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+ const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+ int16x4x4_t d4_w;
+ INIT_VECTOR4(d4_w,
+ vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+ return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+ const int16x4x4_t d4_w) {
+ int32x2_t d_sum;
+ // sum += w[ 0] * abs(b0);
+ // sum += w[ 4] * abs(b1);
+ // sum += w[ 8] * abs(b2);
+ // sum += w[12] * abs(b3);
+ int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+ int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+ int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+ int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+ q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+ q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+ q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+ q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+ q_sum0 = vaddq_s32(q_sum0, q_sum1);
+ q_sum2 = vaddq_s32(q_sum2, q_sum3);
+ q_sum2 = vaddq_s32(q_sum0, q_sum2);
+ d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+ d_sum = vpadd_s32(d_sum, d_sum);
+ return d_sum;
+}
+
+#define LOAD_LANE_32b(src, VALUE, LANE) \
+ (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
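+// A rough scalar equivalent of what the vector code below computes
+// (illustrative sketch only; 'Hadamard4x4' is a hypothetical helper, not an
+// upstream function):
+//   int sum = 0;
+//   for (i = 0; i < 16; ++i) {
+//     sum += w[i] * (abs(Hadamard4x4(a)[i]) - abs(Hadamard4x4(b)[i]));
+//   }
+//   return abs(sum) >> 5;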
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+ uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+ uint8x8x4_t d4_in;
+
+ // load data a, b
+ LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+ LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+ LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+ LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+ LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+ LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+ LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+ LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+ INIT_VECTOR4(d4_in,
+ vreinterpret_u8_u32(d_in_ab_0123),
+ vreinterpret_u8_u32(d_in_ab_4567),
+ vreinterpret_u8_u32(d_in_ab_89ab),
+ vreinterpret_u8_u32(d_in_ab_cdef));
+
+ {
+    // Run the vertical pass first: since w/kWeightY is symmetric, the
+    // vertical and horizontal passes commute, which saves an initial
+    // transpose and the transpose that would undo it.
+ const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+ const int16x4x4_t d4_w = DistoLoadW_NEON(w);
+ // horizontal pass
+ const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+ const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+ int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
+
+ // abs(sum2 - sum1) >> 5
+ d_sum = vabs_s32(d_sum);
+ d_sum = vshr_n_s32(d_sum, 5);
+ return vget_lane_s32(d_sum, 0);
+ }
+}
+#undef LOAD_LANE_32b
+
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_NEON(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ const int16x8_t a0 = vld1q_s16(out + 0);
+ const int16x8_t b0 = vld1q_s16(out + 8);
+ const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+ const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+ const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+ const uint16x8_t b2 = vshrq_n_u16(b1, 3);
+ const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
+ const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
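+      // i.e. per lane: bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH).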
+ vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
+ vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+ const uint8_t* const b,
+ uint32x4_t* const sum) {
+ const uint8x16_t a0 = vld1q_u8(a);
+ const uint8x16_t b0 = vld1q_u8(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
+}
+
+// Horizontal sum of all four uint32_t values in 'sum'.
+static int SumToInt_NEON(uint32x4_t sum) {
+ const uint64x2_t sum2 = vpaddlq_u32(sum);
+ const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
+ return (int)sum3;
+}
+
+static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 16; ++y) {
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ const uint8x8_t a0 = vld1_u8(a + y * BPS);
+ const uint8x8_t b0 = vld1_u8(b + y * BPS);
+ const uint8x8_t abs_diff = vabd_u8(a0, b0);
+ const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
+ sum = vpadalq_u16(sum, prod);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+ const uint8x16_t a0 = Load4x4_NEON(a);
+ const uint8x16_t b0 = Load4x4_NEON(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ return SumToInt_NEON(vaddq_u32(sum1, sum2));
+}
+
+//------------------------------------------------------------------------------
+
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
+
+static int16x8_t Quantize_NEON(int16_t* const in,
+ const VP8Matrix* const mtx, int offset) {
+ const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+ const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+ const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+ const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+ const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+ const int16x8_t a = vld1q_s16(in + offset); // in
+ const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
+ const int16x8_t sign = vshrq_n_s16(a, 15); // sign
+ const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+ const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+ const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
+ const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+ vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
+ const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+ const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+ const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
+ const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+ vst1q_s16(in + offset, c4);
+ assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
+ return c3;
+}
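+
+// Per-lane scalar view of Quantize_NEON above (illustrative sketch only,
+// not part of the upstream sources):
+//   coeff = abs(in[n]) + mtx->sharpen_[n];
+//   level = min((coeff * mtx->iq_[n] + mtx->bias_[n]) >> QFIX, MAX_LEVEL);
+//   ret   = (in[n] < 0) ? -level : level;   // value returned in lane n
+//   in[n] = ret * mtx->q_[n];               // dequantized value, stored back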
+
+static const uint8_t kShuffles[4][8] = {
+ { 0, 1, 2, 3, 8, 9, 16, 17 },
+ { 10, 11, 4, 5, 6, 7, 12, 13 },
+ { 18, 19, 24, 25, 26, 27, 20, 21 },
+ { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
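+
+// Each kShuffles[] entry lists byte indices into the pair (out0, out1):
+// bytes 0..15 pick from out0 (coefficients 0..7) and bytes 16..31 from out1
+// (coefficients 8..15). E.g. kShuffles[0] = {0,1, 2,3, 8,9, 16,17} emits
+// coefficients 0, 1, 4, 8 -- the first four entries of the VP8 zigzag scan.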
+
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+ const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
+ uint8x8x4_t shuffles;
+  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3; use
+  // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
+ uint8x16x2_t all_out;
+ INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+ INIT_VECTOR4(shuffles,
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
+ uint8x8x4_t all_out;
+ INIT_VECTOR4(all_out,
+ vreinterpret_u8_s16(vget_low_s16(out0)),
+ vreinterpret_u8_s16(vget_high_s16(out0)),
+ vreinterpret_u8_s16(vget_low_s16(out1)),
+ vreinterpret_u8_s16(vget_high_s16(out1)));
+ INIT_VECTOR4(shuffles,
+ vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
+ // Zigzag reordering
+ vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
+ vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
+ vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
+ vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
+ // test zeros
+ if (*(uint64_t*)(out + 0) != 0) return 1;
+ if (*(uint64_t*)(out + 4) != 0) return 1;
+ if (*(uint64_t*)(out + 8) != 0) return 1;
+ if (*(uint64_t*)(out + 12) != 0) return 1;
+ return 0;
+}
+
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
+ VP8ITransform = ITransform_NEON;
+ VP8FTransform = FTransform_NEON;
+
+ VP8FTransformWHT = FTransformWHT_NEON;
+
+ VP8TDisto4x4 = Disto4x4_NEON;
+ VP8TDisto16x16 = Disto16x16_NEON;
+ VP8CollectHistogram = CollectHistogram_NEON;
+
+ VP8SSE16x16 = SSE16x16_NEON;
+ VP8SSE16x8 = SSE16x8_NEON;
+ VP8SSE8x8 = SSE8x8_NEON;
+ VP8SSE4x4 = SSE4x4_NEON;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8EncQuantizeBlock = QuantizeBlock_NEON;
+ VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+#endif
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/enc_sse2.c b/media/libwebp/dsp/enc_sse2.c
new file mode 100644
index 0000000000..ff78755111
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse2.c
@@ -0,0 +1,1381 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical encoding functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <stdlib.h> // for abs()
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ // This implementation makes use of 16-bit fixed point versions of two
+ // multiply constants:
+ // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+ // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+ //
+ // To be able to use signed 16-bit integers, we use the following trick to
+ // have constants within range:
+ // - Associated constants are obtained by subtracting the 16-bit fixed point
+ // version of one:
+ // k = K - (1 << 16) => K = k + (1 << 16)
+  //      K1 = 85627 => k1 = 20091
+ // K2 = 35468 => k2 = -30068
+  //  - The multiplication of a variable by a constant becomes the sum of the
+ // variable and the multiplication of that variable by the associated
+ // constant:
+ // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
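+  // Numeric check of the trick (illustrative only): with x = 1000,
+  // (1000 * K1) >> 16 = (1000 * 85627) >> 16 = 1306, while
+  // ((1000 * k1) >> 16) + 1000 = 306 + 1000 = 1306, as expected.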
+ const __m128i k1 = _mm_set1_epi16(20091);
+ const __m128i k2 = _mm_set1_epi16(-30068);
+ __m128i T0, T1, T2, T3;
+
+ // Load and concatenate the transform coefficients (we'll do two inverse
+ // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors will just contain random values we'll never
+  // use or store.
+ __m128i in0, in1, in2, in3;
+ {
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
+ // a00 a10 a20 a30 x x x x
+ // a01 a11 a21 a31 x x x x
+ // a02 a12 a22 a32 x x x x
+ // a03 a13 a23 a33 x x x x
+ if (do_two) {
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
+ in0 = _mm_unpacklo_epi64(in0, inB0);
+ in1 = _mm_unpacklo_epi64(in1, inB1);
+ in2 = _mm_unpacklo_epi64(in2, inB2);
+ in3 = _mm_unpacklo_epi64(in3, inB3);
+ // a00 a10 a20 a30 b00 b10 b20 b30
+ // a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32
+ // a03 a13 a23 a33 b03 b13 b23 b33
+ }
+ }
+
+ // Vertical pass and subsequent transpose.
+ {
+ // First pass, c and d calculations are longer because of the "trick"
+ // multiplications.
+ const __m128i a = _mm_add_epi16(in0, in2);
+ const __m128i b = _mm_sub_epi16(in0, in2);
+ // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+ const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+ const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+ const __m128i c3 = _mm_sub_epi16(in1, in3);
+ const __m128i c4 = _mm_sub_epi16(c1, c2);
+ const __m128i c = _mm_add_epi16(c3, c4);
+ // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+ const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+ const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+ const __m128i d3 = _mm_add_epi16(in1, in3);
+ const __m128i d4 = _mm_add_epi16(d1, d2);
+ const __m128i d = _mm_add_epi16(d3, d4);
+
+ // Second pass.
+ const __m128i tmp0 = _mm_add_epi16(a, d);
+ const __m128i tmp1 = _mm_add_epi16(b, c);
+ const __m128i tmp2 = _mm_sub_epi16(b, c);
+ const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
+ }
+
+ // Horizontal pass and subsequent transpose.
+ {
+ // First pass, c and d calculations are longer because of the "trick"
+ // multiplications.
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i dc = _mm_add_epi16(T0, four);
+ const __m128i a = _mm_add_epi16(dc, T2);
+ const __m128i b = _mm_sub_epi16(dc, T2);
+ // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+ const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+ const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+ const __m128i c3 = _mm_sub_epi16(T1, T3);
+ const __m128i c4 = _mm_sub_epi16(c1, c2);
+ const __m128i c = _mm_add_epi16(c3, c4);
+ // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+ const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+ const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+ const __m128i d3 = _mm_add_epi16(T1, T3);
+ const __m128i d4 = _mm_add_epi16(d1, d2);
+ const __m128i d = _mm_add_epi16(d3, d4);
+
+ // Second pass.
+ const __m128i tmp0 = _mm_add_epi16(a, d);
+ const __m128i tmp1 = _mm_add_epi16(b, c);
+ const __m128i tmp2 = _mm_sub_epi16(b, c);
+ const __m128i tmp3 = _mm_sub_epi16(a, d);
+ const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+ const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+ const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+ const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
+ &T2, &T3);
+ }
+
+ // Add inverse transform to 'ref' and store.
+ {
+ const __m128i zero = _mm_setzero_si128();
+ // Load the reference(s).
+ __m128i ref0, ref1, ref2, ref3;
+ if (do_two) {
+ // Load eight bytes/pixels per line.
+ ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ } else {
+ // Load four bytes/pixels per line.
+ ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
+ ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
+ ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
+ ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
+ }
+ // Convert to 16b.
+ ref0 = _mm_unpacklo_epi8(ref0, zero);
+ ref1 = _mm_unpacklo_epi8(ref1, zero);
+ ref2 = _mm_unpacklo_epi8(ref2, zero);
+ ref3 = _mm_unpacklo_epi8(ref3, zero);
+ // Add the inverse transform(s).
+ ref0 = _mm_add_epi16(ref0, T0);
+ ref1 = _mm_add_epi16(ref1, T1);
+ ref2 = _mm_add_epi16(ref2, T2);
+ ref3 = _mm_add_epi16(ref3, T3);
+ // Unsigned saturate to 8b.
+ ref0 = _mm_packus_epi16(ref0, ref0);
+ ref1 = _mm_packus_epi16(ref1, ref1);
+ ref2 = _mm_packus_epi16(ref2, ref2);
+ ref3 = _mm_packus_epi16(ref3, ref3);
+ // Store the results.
+ if (do_two) {
+ // Store eight bytes/pixels per line.
+ _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+ _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+ _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+ _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+ } else {
+ // Store four bytes/pixels per line.
+ WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
+ WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
+ WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
+ WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
+ }
+ }
+}
+
+static void FTransformPass1_SSE2(const __m128i* const in01,
+ const __m128i* const in23,
+ __m128i* const out01,
+ __m128i* const out32) {
+ const __m128i k937 = _mm_set1_epi32(937);
+ const __m128i k1812 = _mm_set1_epi32(1812);
+
+ const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+ const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+ const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+ 2217, 5352, 2217, 5352);
+ const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+ -5352, 2217, -5352, 2217);
+
+ // *in01 = 00 01 10 11 02 03 12 13
+ // *in23 = 20 21 30 31 22 23 32 33
+ const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
+ // 00 01 10 11 03 02 13 12
+ // 20 21 30 31 23 22 33 32
+ const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+ const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+ // 00 01 10 11 20 21 30 31
+ // 03 02 13 12 23 22 33 32
+ const __m128i a01 = _mm_add_epi16(s01, s32);
+ const __m128i a32 = _mm_sub_epi16(s01, s32);
+ // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+ // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+ const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]
+ const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]
+ const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+ const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+ const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+ const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+ const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
+ const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
+ const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+ const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+ const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
+ const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
+ const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+ *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
+ *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
+}
+
+static void FTransformPass2_SSE2(const __m128i* const v01,
+ const __m128i* const v32,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i seven = _mm_set1_epi16(7);
+ const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
+ 5352, 2217, 5352, 2217);
+ const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+ 2217, -5352, 2217, -5352);
+ const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+ const __m128i k51000 = _mm_set1_epi32(51000);
+
+ // Same operations are done on the (0,3) and (1,2) pairs.
+ // a3 = v0 - v3
+ // a2 = v1 - v2
+ const __m128i a32 = _mm_sub_epi16(*v01, *v32);
+ const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+
+ const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+ const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+ const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+ const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+ const __m128i d3 = _mm_add_epi32(c3, k51000);
+ const __m128i e1 = _mm_srai_epi32(d1, 16);
+ const __m128i e3 = _mm_srai_epi32(d3, 16);
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+ const __m128i f1 = _mm_packs_epi32(e1, e1);
+ const __m128i f3 = _mm_packs_epi32(e3, e3);
+ // g1 = f1 + (a3 != 0);
+ // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+ // desired (0, 1), we add one earlier through k12000_plus_one.
+ // -> g1 = f1 + 1 - (a3 == 0)
+ const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ const __m128i a01 = _mm_add_epi16(*v01, *v32);
+ const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+ const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+ const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+ const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ const __m128i d0 = _mm_srai_epi16(c0, 4);
+ const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+}
+
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ // Load src.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ // 00 01 02 03 *
+ // 10 11 12 13 *
+ // 20 21 22 23 *
+ // 30 31 32 33 *
+ // Shuffle.
+ const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
+ const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
+ // 00 01 10 11 02 03 12 13 * * ...
+  // 20 21 30 31 22 23 32 33 * * ...
+
+ // Load ref.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
+ const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
+
+ // Convert both to 16 bit.
+ const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
+ const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
+ const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
+ const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
+
+ // Compute the difference.
+ const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
+ const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
+ __m128i v01, v32;
+
+ // First pass
+ FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
+
+ // Second pass
+ FTransformPass2_SSE2(&v01, &v32, out);
+}
+
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00' 01' 02' 03'
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
+ const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
+ const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
+ __m128i v01l, v32l;
+ __m128i v01h, v32h;
+
+ // First pass
+ FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+ FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
+
+ // Second pass
+ FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+ FTransformPass2_SSE2(&v01h, &v32h, out + 16);
+}
+
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+ const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
+ const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
+ const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
+ const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
+ const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
+ const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...
+ const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...
+ const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...
+ const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...
+ const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ...
+ const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ...
+ const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1
+ *out = _mm_madd_epi16(D, kMult);
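+  // i.e. *out = { a0+a1, a3+a2, a3-a2, a0-a1 } as four 32-bit lanes.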
+}
+
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+ // Input is 12b signed.
+ __m128i row0, row1, row2, row3;
+ // Rows are 14b signed.
+ FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+ FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+ FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+ FTransformWHTRow_SSE2(in + 3 * 64, &row3);
+
+ {
+ // The a* are 15b signed.
+ const __m128i a0 = _mm_add_epi32(row0, row2);
+ const __m128i a1 = _mm_add_epi32(row1, row3);
+ const __m128i a2 = _mm_sub_epi32(row1, row3);
+ const __m128i a3 = _mm_sub_epi32(row0, row2);
+ const __m128i a0a3 = _mm_packs_epi32(a0, a3);
+ const __m128i a1a2 = _mm_packs_epi32(a1, a2);
+
+ // The b* are 16b signed.
+ const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);
+ const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);
+ const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);
+ const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);
+
+ _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));
+ _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));
+ }
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ const __m128i d0 = _mm_sub_epi16(zero, out0);
+ const __m128i d1 = _mm_sub_epi16(zero, out1);
+ const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b
+ const __m128i abs1 = _mm_max_epi16(out1, d1);
+ // v = abs(out) >> 3
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
+ if (size == 4) {
+ int j;
+ for (j = 0; j < 4; ++j) {
+ memset(dst + j * BPS, value, 4);
+ }
+ } else if (size == 8) {
+ Put8x8uv_SSE2(value, dst);
+ } else {
+ Put16_SSE2(value, dst);
+ }
+}
+
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+ int j;
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+ const uint8_t* top, int size) {
+ if (top != NULL) {
+ if (size == 8) {
+ VE8uv_SSE2(dst, top);
+ } else {
+ VE16_SSE2(dst, top);
+ }
+ } else {
+ Fill_SSE2(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_store_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ if (size == 8) {
+ HE8uv_SSE2(dst, left);
+ } else {
+ HE16_SSE2(dst, left);
+ }
+ } else {
+ Fill_SSE2(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_store_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ if (left != NULL) {
+ if (top != NULL) {
+ TM_SSE2(dst, left, top, size);
+ } else {
+ HorizontalPred_SSE2(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred_SSE2(dst, top, size);
+ } else {
+ Fill_SSE2(dst, 129, size);
+ }
+ }
+}
+
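The dispatch above implements VP8's TM ("TrueMotion") rule: each predicted pixel is clip(top[x] + left[y] - top_left). A scalar sketch for reference, assuming BPS is the row stride from dsp.h; the clipping mirrors the unsigned 8-bit saturation of _mm_packus_epi16 in TM_SSE2:

```c
/* Scalar sketch of the TM predictor vectorized in TM_SSE2 above.
 * left[-1] holds the top-left sample. Illustration only. */
static void TrueMotion_C(uint8_t* dst, const uint8_t* left,
                         const uint8_t* top, int size) {
  int x, y;
  for (y = 0; y < size; ++y, dst += BPS) {
    const int delta = left[y] - left[-1];
    for (x = 0; x < size; ++x) {
      const int v = top[x] + delta;
      dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clamp [0,255] */
    }
  }
}
```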
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
+ const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
+ const int DC = VP8HorizontalAdd8b(&combined) + 8;
+ Put8x8uv_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i sum = _mm_sad_epu8(top_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv_SSE2(DC >> 3, dst);
+}
+
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC8uvNoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+ Put8x8uv_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC8uv_SSE2(dst, left, top);
+ } else { // top, but no left
+ DC8uvNoLeft_SSE2(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC8uvNoTop_SSE2(dst, left);
+ } else { // no top, no left, nothing.
+ DC8uvNoTopLeft_SSE2(dst);
+ }
+}
+
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i left_row = _mm_load_si128((const __m128i*)left);
+ const int DC =
+ VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
+ Put16_SSE2(DC >> 5, dst);
+}
+
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const int DC = VP8HorizontalAdd8b(&top_row) + 8;
+ Put16_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC16NoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+ Put16_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC16_SSE2(dst, left, top);
+ } else { // top, but no left
+ DC16NoLeft_SSE2(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC16NoTop_SSE2(dst, left);
+ } else { // no top, no left, nothing.
+ DC16NoTopLeft_SSE2(dst);
+ }
+}
+
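All four DC16 cases reduce to one scalar rule: take the rounded average of whichever borders are available, or fall back to the mid-value 0x80 when neither exists. A sketch of the combined logic (BPS is the row stride from dsp.h; illustration only):

```c
/* Scalar sketch of the DC16 dispatch above. */
#include <string.h>  /* memset() */

static void DC16_C_sketch(uint8_t* dst, const uint8_t* left,
                          const uint8_t* top) {
  int dc = 0, i;
  if (top != NULL && left != NULL) {
    for (i = 0; i < 16; ++i) dc += top[i] + left[i];
    dc = (dc + 16) >> 5;        /* rounded average of 32 samples */
  } else if (top != NULL || left != NULL) {
    const uint8_t* const edge = (top != NULL) ? top : left;
    for (i = 0; i < 16; ++i) dc += edge[i];
    dc = (dc + 8) >> 4;         /* rounded average of 16 samples */
  } else {
    dc = 0x80;                  /* no context at all */
  }
  for (i = 0; i < 16; ++i) memset(dst + i * BPS, dc, 16);
}
```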
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = ((AB + BC + 1) >> 1) - ((ab|bc) & lsb)
+// where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
+// and ab = a ^ b, bc = b ^ c, lsb = (AB^BC) & 1
+
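These identities are what let the 4x4 predictors below emulate AVG3 with _mm_avg_epu8 (a rounding byte average) plus a one-bit correction. As a sanity check, the first identity can be verified exhaustively with a tiny standalone program (illustration only):

```c
/* Exhaustive check of the first identity above. avg2() is the rounding
 * byte average that _mm_avg_epu8 computes per lane. */
#include <assert.h>

static int avg2(int x, int y) { return (x + y + 1) >> 1; }

int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int avg3 = (a + 2 * b + c + 2) >> 2;
        const int ac = avg2(a, c) - ((a ^ c) & 1);   /* == (a + c) >> 1 */
        assert(avg2(ac, b) == avg3);
      }
    }
  }
  return 0;
}
```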
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ WebPUint32ToMem(dst + i * BPS, vals);
+ }
+}
+
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill_SSE2(dst, dc >> 3, 4);
+}
+
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int X = top[-1];
+ const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+ // these two are hard to implement in SSE2, so we keep the C-version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
+
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these last two pixels don't fit the shuffle pattern; extract them
+  // separately
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
+ const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ int y;
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = top[-2 - y] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+ }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+ DC4_SSE2(I4DC4 + dst, top);
+ TM4_SSE2(I4TM4 + dst, top);
+ VE4_SSE2(I4VE4 + dst, top);
+ HE4_SSE2(I4HE4 + dst, top);
+ RD4_SSE2(I4RD4 + dst, top);
+ VR4_SSE2(I4VR4 + dst, top);
+ LD4_SSE2(I4LD4 + dst, top);
+ VL4_SSE2(I4VL4 + dst, top);
+ HD4_SSE2(I4HD4 + dst, top);
+ HU4_SSE2(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_SSE2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DC16Mode_SSE2(I16DC16 + dst, left, top);
+ VerticalPred_SSE2(I16VE16 + dst, top, 16);
+ HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+ TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+ const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+ int num_pairs) {
+ __m128i sum = _mm_setzero_si128();
+ int32_t tmp[4];
+ int i;
+
+ for (i = 0; i < num_pairs; ++i) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
+ __m128i sum1, sum2;
+ SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+ SubtractAndAccumulate_SSE2(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 8);
+}
+
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 4);
+}
+
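The kernels in this section all compute the plain sum of squared differences over a block whose rows are BPS bytes apart. A scalar reference, for orientation (illustration only):

```c
/* Scalar reference for the sum-of-squared-errors kernels in this section
 * (BPS is the fixed row stride from dsp.h). */
static int SSE_WxH_C(const uint8_t* a, const uint8_t* b, int w, int h) {
  int x, y, sum = 0;
  for (y = 0; y < h; ++y, a += BPS, b += BPS) {
    for (x = 0; x < w; ++x) {
      const int d = a[x] - b[x];
      sum += d * d;
    }
  }
  return sum;
}
```

Under this sketch, SSE16x16_SSE2 corresponds to SSE_WxH_C(a, b, 16, 16), SSE8x8_SSE2 to (8, 8), and so on.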
+#define LOAD_8x16b(ptr) \
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
+
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+ int num_pairs = 4;
+ __m128i sum = zero;
+ int32_t tmp[4];
+ while (num_pairs-- > 0) {
+ const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+ const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+ const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+ const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+ // subtract
+ const __m128i c0 = _mm_subs_epi16(a0, b0);
+ const __m128i c1 = _mm_subs_epi16(a1, b1);
+ // multiply/accumulate with self
+ const __m128i d0 = _mm_madd_epi16(c0, c0);
+ const __m128i d1 = _mm_madd_epi16(c1, c1);
+ // collect
+ const __m128i sum01 = _mm_add_epi32(d0, d1);
+ sum = _mm_add_epi32(sum, sum01);
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+#undef LOAD_8x16b
+
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load values. Note that we read 8 pixels instead of 4,
+ // but the a/b buffers are over-allocated to that effect.
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
+ const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
+ const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
+ const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
+ const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
+ // Combine pair of lines.
+ const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+ const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+ // Convert to 16b.
+ const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+ const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+ const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+ const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+ // subtract, square and accumulate
+ const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+ const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+ const __m128i e0 = _mm_madd_epi16(d0, d0);
+ const __m128i e1 = _mm_madd_epi16(d1, d1);
+ const __m128i sum = _mm_add_epi32(e0, e1);
+
+ int32_t tmp[4];
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+ const __m128i mask = _mm_set1_epi16(0x00ff);
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+ const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte
+ const __m128i b1 = _mm_srli_epi16(a1, 8);
+ const __m128i b2 = _mm_srli_epi16(a2, 8);
+ const __m128i b3 = _mm_srli_epi16(a3, 8);
+ const __m128i c0 = _mm_and_si128(a0, mask); // lo byte
+ const __m128i c1 = _mm_and_si128(a1, mask);
+ const __m128i c2 = _mm_and_si128(a2, mask);
+ const __m128i c3 = _mm_and_si128(a3, mask);
+ const __m128i d0 = _mm_add_epi32(b0, c0);
+ const __m128i d1 = _mm_add_epi32(b1, c1);
+ const __m128i d2 = _mm_add_epi32(b2, c2);
+ const __m128i d3 = _mm_add_epi32(b3, c3);
+ const __m128i e0 = _mm_add_epi32(d0, d1);
+ const __m128i e1 = _mm_add_epi32(d2, d3);
+ const __m128i f0 = _mm_add_epi32(e0, e1);
+ uint16_t tmp[8];
+ _mm_storeu_si128((__m128i*)tmp, f0);
+ dc[0] = tmp[0] + tmp[1];
+ dc[1] = tmp[2] + tmp[3];
+ dc[2] = tmp[4] + tmp[5];
+ dc[3] = tmp[6] + tmp[7];
+}
+
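Despite the name, Mean16x4 stores the raw sums (not means) of the four 4x4 sub-blocks of a 16-wide, 4-row strip; the caller derives averages from them. A scalar sketch (illustration only):

```c
/* Scalar sketch of Mean16x4: dc[k] receives the sum of the k-th 4x4
 * sub-block of a 16x4 strip with row stride BPS. */
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t sum = 0;
    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) sum += ref[x + 4 * k + y * BPS];
    }
    dc[k] = sum;
  }
}
```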
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values
+// of the transformed coefficients of inA and inB.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
+ int32_t sum[4];
+ __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load and combine inputs.
+ {
+ const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+ // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+ tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
+ tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
+ tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
+ tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+ }
+
+  // Vertical pass first: the vertical and horizontal passes commute because
+  // w/kWeightY is symmetric, so a single mid-way transpose suffices.
+ {
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+ }
+
+ // Horizontal pass and difference of weighted sums.
+ {
+ // Load all inputs.
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+ // Separate the transforms of inA and inB.
+ __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+ __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+ __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+ __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+ {
+ const __m128i d0 = _mm_sub_epi16(zero, A_b0);
+ const __m128i d1 = _mm_sub_epi16(zero, A_b2);
+ const __m128i d2 = _mm_sub_epi16(zero, B_b0);
+ const __m128i d3 = _mm_sub_epi16(zero, B_b2);
+ A_b0 = _mm_max_epi16(A_b0, d0); // abs(v), 16b
+ A_b2 = _mm_max_epi16(A_b2, d1);
+ B_b0 = _mm_max_epi16(B_b0, d2);
+ B_b2 = _mm_max_epi16(B_b2, d3);
+ }
+
+ // weighted sums
+ A_b0 = _mm_madd_epi16(A_b0, w_0);
+ A_b2 = _mm_madd_epi16(A_b2, w_8);
+ B_b0 = _mm_madd_epi16(B_b0, w_0);
+ B_b2 = _mm_madd_epi16(B_b2, w_8);
+ A_b0 = _mm_add_epi32(A_b0, A_b2);
+ B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+ // difference of weighted sums
+ A_b0 = _mm_sub_epi32(A_b0, B_b0);
+ _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+ }
+ return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE2(a, b, w);
+ return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_SSE2(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
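A scalar rendition of this metric, in the spirit of libwebp's portable C path (a sketch; names are illustrative): transform each 4x4 block with the Walsh-Hadamard-style butterflies above, then take the difference of the weighted sums of absolute coefficients, normalized by >> 5 as in Disto4x4_SSE2.

```c
/* Scalar sketch of the texture-distortion metric. */
#include <stdlib.h>  /* abs() */

static int TTransform_C_sketch(const uint8_t* in, const uint16_t* w) {
  int i, sum = 0;
  int tmp[16];
  for (i = 0; i < 4; ++i, in += BPS) {   /* horizontal pass */
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         /* vertical pass + weighting */
    const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)
         + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

static int Disto4x4_C_sketch(const uint8_t* a, const uint8_t* b,
                             const uint16_t* w) {
  return abs(TTransform_C_sketch(a, w) - TTransform_C_sketch(b, w)) >> 5;
}
```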
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i coeff0, coeff8;
+ __m128i out0, out8;
+ __m128i packed_out;
+
+ // Load all inputs.
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+ __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // extract sign(in) (0x0000 if positive, 0xffff if negative)
+ const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+ const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
+
+ // coeff = abs(in) = (in ^ sign) - sign
+ coeff0 = _mm_xor_si128(in0, sign0);
+ coeff8 = _mm_xor_si128(in8, sign8);
+ coeff0 = _mm_sub_epi16(coeff0, sign0);
+ coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+ // coeff = abs(in) + sharpen
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
+
+ // out = (coeff * iQ + B) >> QFIX
+ {
+ // doing calculations with 32b precision (QFIX=17)
+ // out = (coeff * iQ)
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+ __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+ // out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+ out_00 = _mm_add_epi32(out_00, bias_00);
+ out_04 = _mm_add_epi32(out_04, bias_04);
+ out_08 = _mm_add_epi32(out_08, bias_08);
+ out_12 = _mm_add_epi32(out_12, bias_12);
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
+ out_00 = _mm_srai_epi32(out_00, QFIX);
+ out_04 = _mm_srai_epi32(out_04, QFIX);
+ out_08 = _mm_srai_epi32(out_08, QFIX);
+ out_12 = _mm_srai_epi32(out_12, QFIX);
+
+ // pack result as 16b
+ out0 = _mm_packs_epi32(out_00, out_04);
+ out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
+ }
+
+ // get sign back (if (sign[j]) out_n = -out_n)
+ out0 = _mm_xor_si128(out0, sign0);
+ out8 = _mm_xor_si128(out8, sign8);
+ out0 = _mm_sub_epi16(out0, sign0);
+ out8 = _mm_sub_epi16(out8, sign8);
+
+ // in = out * Q
+ in0 = _mm_mullo_epi16(out0, q0);
+ in8 = _mm_mullo_epi16(out8, q8);
+
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
+
+ // zigzag the output before storing it.
+ //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. Afterwards, only the 7th value (which lands in position 3
+  // instead of 12) and the 8th value still need to be swapped.
+ {
+ __m128i outZ0, outZ8;
+ outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
+ outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+ outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+ outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
+ outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+ outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+ _mm_storeu_si128((__m128i*)&out[0], outZ0);
+ _mm_storeu_si128((__m128i*)&out[8], outZ8);
+ packed_out = _mm_packs_epi16(outZ0, outZ8);
+ }
+ {
+ const int16_t outZ_12 = out[12];
+ const int16_t outZ_3 = out[3];
+ out[3] = outZ_12;
+ out[12] = outZ_3;
+ }
+
+ // detect if all 'out' values are zeroes or not
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
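Per coefficient, the vector code above implements level = min(((|in| + sharpen) * iq + bias) >> QFIX, MAX_LEVEL), restores the sign, dequantizes back into in[], and emits levels in zigzag order. A scalar sketch under those assumptions (QFIX = 17, MAX_LEVEL = 2047, and VP8Matrix come from the encoder headers; illustration only):

```c
/* Scalar sketch of DoQuantizeBlock. kZigzag is the standard 4x4 scan. */
static const int kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static int QuantizeBlock_C_sketch(int16_t in[16], int16_t out[16],
                                  const uint16_t* sharpen,
                                  const VP8Matrix* mtx) {
  int n, nz = 0;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j])
                         + (sharpen != NULL ? sharpen[j] : 0);
    int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
    if (level > MAX_LEVEL) level = MAX_LEVEL;  /* clamp to 2047 */
    if (sign) level = -level;
    in[j] = (int16_t)(level * mtx->q_[j]);     /* dequantized value */
    out[n] = (int16_t)level;                   /* zigzag-ordered level */
    nz |= (level != 0);
  }
  return nz;
}
```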
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
+ VP8CollectHistogram = CollectHistogram_SSE2;
+ VP8EncPredLuma16 = Intra16Preds_SSE2;
+ VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+ VP8EncPredLuma4 = Intra4Preds_SSE2;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+ VP8ITransform = ITransform_SSE2;
+ VP8FTransform = FTransform_SSE2;
+ VP8FTransform2 = FTransform2_SSE2;
+ VP8FTransformWHT = FTransformWHT_SSE2;
+ VP8SSE16x16 = SSE16x16_SSE2;
+ VP8SSE16x8 = SSE16x8_SSE2;
+ VP8SSE8x8 = SSE8x8_SSE2;
+ VP8SSE4x4 = SSE4x4_SSE2;
+ VP8TDisto4x4 = Disto4x4_SSE2;
+ VP8TDisto16x16 = Disto16x16_SSE2;
+ VP8Mean16x4 = Mean16x4_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/enc_sse41.c b/media/libwebp/dsp/enc_sse41.c
new file mode 100644
index 0000000000..09ea29361d
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse41.c
@@ -0,0 +1,339 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/common_sse2.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms.
+
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ // v = abs(out) >> 3
+ const __m128i abs0 = _mm_abs_epi16(out0);
+ const __m128i abs1 = _mm_abs_epi16(out1);
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+    // Accumulate the bins into the histogram distribution.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values
+// of the transformed coefficients of inA and inB.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
+ int32_t sum[4];
+ __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+
+ // Load and combine inputs.
+ {
+ const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
+    // With SSE4.1 and gcc 4.8 at least (possibly other versions),
+    // _mm_loadu_si128 is faster than _mm_loadl_epi64. For the last row
+    // of inA and inB, however, _mm_loadl_epi64 is still used to avoid an
+    // out-of-bounds read.
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+ // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+ tmp_0 = _mm_cvtepu8_epi16(inAB_0);
+ tmp_1 = _mm_cvtepu8_epi16(inAB_1);
+ tmp_2 = _mm_cvtepu8_epi16(inAB_2);
+ tmp_3 = _mm_cvtepu8_epi16(inAB_3);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+ }
+
+  // Vertical pass first: the vertical and horizontal passes commute because
+  // w/kWeightY is symmetric, so a single mid-way transpose suffices.
+ {
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+ }
+
+ // Horizontal pass and difference of weighted sums.
+ {
+ // Load all inputs.
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+ // Separate the transforms of inA and inB.
+ __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+ __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+ __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+ __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+ A_b0 = _mm_abs_epi16(A_b0);
+ A_b2 = _mm_abs_epi16(A_b2);
+ B_b0 = _mm_abs_epi16(B_b0);
+ B_b2 = _mm_abs_epi16(B_b2);
+
+ // weighted sums
+ A_b0 = _mm_madd_epi16(A_b0, w_0);
+ A_b2 = _mm_madd_epi16(A_b2, w_8);
+ B_b0 = _mm_madd_epi16(B_b0, w_0);
+ B_b2 = _mm_madd_epi16(B_b2, w_8);
+ A_b0 = _mm_add_epi32(A_b0, A_b2);
+ B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+ // difference of weighted sums
+ A_b2 = _mm_sub_epi32(A_b0, B_b0);
+ _mm_storeu_si128((__m128i*)&sum[0], A_b2);
+ }
+ return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE41(a, b, w);
+ return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_SSE41(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Generates a pshufb constant for shuffling 16b words.
+#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
+ _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
+ 2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
+ 2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
+ 2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
+
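PSHUFB_CST expands each 16-bit word index into the two control bytes _mm_shuffle_epi8 expects; a negative index produces bytes 0xFE/0xFF, whose set high bit makes pshufb write zero to that lane. A hypothetical standalone illustration, reusing the macro defined above:

```c
/* Demo of PSHUFB_CST: shuffle eight 16-bit words, zeroing lane 3. */
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3 _mm_shuffle_epi8 */

int main(void) {
  const __m128i words = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
  const __m128i ctrl = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
  short v[8];
  _mm_storeu_si128((__m128i*)v, _mm_shuffle_epi8(words, ctrl));
  printf("%d %d %d %d %d %d %d %d\n",  /* prints: 10 11 14 0 15 12 13 16 */
         v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
  return 0;
}
```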
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i out0, out8;
+ __m128i packed_out;
+
+ // Load all inputs.
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+ __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // coeff = abs(in)
+ __m128i coeff0 = _mm_abs_epi16(in0);
+ __m128i coeff8 = _mm_abs_epi16(in8);
+
+ // coeff = abs(in) + sharpen
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
+
+ // out = (coeff * iQ + B) >> QFIX
+ {
+ // doing calculations with 32b precision (QFIX=17)
+ // out = (coeff * iQ)
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+ __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+ // out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+ out_00 = _mm_add_epi32(out_00, bias_00);
+ out_04 = _mm_add_epi32(out_04, bias_04);
+ out_08 = _mm_add_epi32(out_08, bias_08);
+ out_12 = _mm_add_epi32(out_12, bias_12);
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
+ out_00 = _mm_srai_epi32(out_00, QFIX);
+ out_04 = _mm_srai_epi32(out_04, QFIX);
+ out_08 = _mm_srai_epi32(out_08, QFIX);
+ out_12 = _mm_srai_epi32(out_12, QFIX);
+
+ // pack result as 16b
+ out0 = _mm_packs_epi32(out_00, out_04);
+ out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
+ }
+
+ // put sign back
+ out0 = _mm_sign_epi16(out0, in0);
+ out8 = _mm_sign_epi16(out8, in8);
+
+ // in = out * Q
+ in0 = _mm_mullo_epi16(out0, q0);
+ in8 = _mm_mullo_epi16(out8, q8);
+
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
+
+ // zigzag the output before storing it. The re-ordering is:
+ // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
+ // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+  // There are only two misplaced entries ([8] and [7]), and they cross the
+  // register boundary.
+ // We use pshufb instead of pshuflo/pshufhi.
+ {
+ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
+ const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
+ const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
+ const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
+ const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
+ const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
+ const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
+ const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
+ const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
+ const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
+ _mm_storeu_si128((__m128i*)&out[0], out_z0);
+ _mm_storeu_si128((__m128i*)&out[8], out_z8);
+ packed_out = _mm_packs_epi16(out_z0, out_z8);
+ }
+
+ // detect if all 'out' values are zeroes or not
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+#undef PSHUFB_CST
+
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE41(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
+ VP8CollectHistogram = CollectHistogram_SSE41;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+ VP8TDisto4x4 = Disto4x4_SSE41;
+ VP8TDisto16x16 = Disto16x16_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/filters.c b/media/libwebp/dsp/filters.c
index dea3eb4101..b0c659478f 100644
--- a/media/libwebp/dsp/filters.c
+++ b/media/libwebp/dsp/filters.c
@@ -33,9 +33,9 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
- for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+ for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
} else {
- for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+ for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
}
@@ -155,7 +155,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
- out[w] = in[w] + (inverse ? pred : -pred);
+ out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
}
++row;
preds += stride;
@@ -194,7 +194,7 @@ static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
- out[i] = pred + in[i];
+ out[i] = (uint8_t)(pred + in[i]);
pred = out[i];
}
}
@@ -206,7 +206,7 @@ static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
- for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
+ for (i = 0; i < width; ++i) out[i] = (uint8_t)(prev[i] + in[i]);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
@@ -220,7 +220,7 @@ static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
- left = in[i] + GradientPredictor_C(left, top, top_left);
+ left = (uint8_t)(in[i] + GradientPredictor_C(left, top, top_left));
top_left = top;
out[i] = left;
}
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
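The new (uint8_t) casts in the hunks above do not change behavior: the byte operands are promoted to int, and storing the result back into a uint8_t already wraps modulo 256. The casts make that intent explicit and silence implicit-conversion warnings. A minimal demonstration:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t a = 200, b = 100;        /* promoted to int: a + b == 300 */
  const uint8_t sum = (uint8_t)(a + b);  /* wraps mod 256 -> 44 */
  printf("%u\n", sum);
  return 0;
}
```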
diff --git a/media/libwebp/dsp/filters_mips_dsp_r2.c b/media/libwebp/dsp/filters_mips_dsp_r2.c
new file mode 100644
index 0000000000..edb1eaac26
--- /dev/null
+++ b/media/libwebp/dsp/filters_mips_dsp_r2.c
@@ -0,0 +1,402 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 2 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp6], %[length], 3 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "1: \n\t" \
+ "lbu %[temp1], -1(%[dst]) \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ "lbu %[temp3], 1(%[src]) \n\t" \
+ "lbu %[temp4], 2(%[src]) \n\t" \
+ "lbu %[temp5], 3(%[src]) \n\t" \
+ "addu %[temp1], %[temp1], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp2], %[temp4] \n\t" \
+ "addu %[temp4], %[temp3], %[temp5] \n\t" \
+ "sb %[temp1], 0(%[dst]) \n\t" \
+ "sb %[temp2], 1(%[dst]) \n\t" \
+ "sb %[temp3], 2(%[dst]) \n\t" \
+ "sb %[temp4], 3(%[dst]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 4 \n\t" \
+ ".else \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], -1(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "subu.qb %[temp3], %[temp2], %[temp1] \n\t" \
+ "usw %[temp3], 0(%[dst]) \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ "4: \n\t" \
+ "beqz %[temp6], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "lbu %[temp1], -1(%[dst]) \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "lbu %[temp1], -1(%[src]) \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ "addiu %[temp6], %[temp6], -1 \n\t" \
+ "bnez %[temp6], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+ int length) {
+ DO_PREDICT_LINE(src, dst, length, 0);
+}
+
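DO_PREDICT_LINE processes four pixels per iteration plus a scalar tail; in portable C, the two INVERSE variants reduce to the following (a sketch for readability, not part of the patch):

```c
/* Portable C equivalent of DO_PREDICT_LINE. The forward variant
 * left-predicts src into dst; the inverse variant accumulates against
 * previously reconstructed dst bytes. Callers pass src + 1 / dst + 1,
 * so the index -1 reads are the seed pixels. */
static void PredictLine_C_sketch(const uint8_t* src, uint8_t* dst,
                                 int length, int inverse) {
  int i;
  if (inverse) {
    for (i = 0; i < length; ++i) {
      dst[i] = (uint8_t)(dst[i - 1] + src[i]);
    }
  } else {
    for (i = 0; i < length; ++i) {
      dst[i] = (uint8_t)(src[i] - src[i - 1]);
    }
  }
}
```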
+#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ const uint8_t* ppred = (uint8_t*)(PRED); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 0x3 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp7], %[length], 0x7 \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], 0(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[pred]) \n\t" \
+ "ulw %[temp3], 4(%[src]) \n\t" \
+ "ulw %[temp4], 4(%[pred]) \n\t" \
+ "addiu %[src], %[src], 8 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "addu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".else \n\t" \
+ "subu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "subu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".endif \n\t" \
+ "addiu %[pred], %[pred], 8 \n\t" \
+ "usw %[temp5], 0(%[dst]) \n\t" \
+ "usw %[temp6], 4(%[dst]) \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 8 \n\t" \
+ "4: \n\t" \
+ "beqz %[temp7], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ "addiu %[pred], %[pred], 1 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ "addiu %[temp7], %[temp7], -1 \n\t" \
+ "bnez %[temp7], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred), \
+ [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \
+ int temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \
+ : "memory" \
+ ); \
+ } while (0)
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+#define FILTER_LINE_BY_LINE do { \
+ while (row < last_row) { \
+ PREDICT_LINE_ONE_PASS(in, preds - stride, out); \
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+ int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+#define FILTER_LINE_BY_LINE do { \
+ while (row < last_row) { \
+ DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ in += stride;
+ out += stride;
+ } else {
+ // We are starting from in-between. Make sure 'preds' points to prev row.
+ preds -= stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
+ int temp0;
+ __asm__ volatile (
+ "addu %[temp0], %[a], %[b] \n\t"
+ "subu %[temp0], %[temp0], %[c] \n\t"
+ "shll_s.w %[temp0], %[temp0], 23 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp0], $zero \n\t"
+ "srl %[temp0], %[temp0], 24 \n\t"
+ : [temp0]"=&r"(temp0)
+ : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+ );
+ return temp0;
+}
+
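The shll_s.w / precrqu_s.qb.ph pair saturates the 32-bit gradient to [0, 255] without branches; in plain C the function is simply the clamped gradient predictor:

```c
/* Scalar equivalent of the DSP-ASE sequence above. */
static int GradientPredictor_C_sketch(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return (g < 0) ? 0 : (g > 255) ? 255 : g;
}
```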
+#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \
+ while (row < last_row) { \
+ int w; \
+ PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
+ for (w = 1; w < width; ++w) { \
+ const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
+ PREDS[w - stride], \
+ PREDS[w - stride - 1]); \
+ out[w] = in[w] OPERATION pred; \
+ } \
+ ++row; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE(in, -);
+}
+#undef FILTER_LINE_BY_LINE
+
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
+}
+
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+ } else {
+ DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
+ }
+}
+
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+ } else {
+ uint8_t top = prev[0], top_left = top, left = top;
+ int i;
+ for (i = 0; i < width; ++i) {
+ top = prev[i]; // need to read this first, in case prev==dst
+ left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
+ top_left = top;
+ out[i] = left;
+ }
+ }
+}
+
+#undef DO_PREDICT_LINE_VERTICAL
+#undef PREDICT_LINE_ONE_PASS
+#undef DO_PREDICT_LINE
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
+
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/filters_msa.c b/media/libwebp/dsp/filters_msa.c
new file mode 100644
index 0000000000..cd32cdabaf
--- /dev/null
+++ b/media/libwebp/dsp/filters_msa.c
@@ -0,0 +1,202 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of alpha filters
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+#include <assert.h>
+
+static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
+ const uint8_t* pred,
+ uint8_t* dst, int length) {
+ v16u8 src0, pred0, dst0;
+ assert(length >= 0);
+ while (length >= 32) {
+ v16u8 src1, pred1, dst1;
+ LD_UB2(src, 16, src0, src1);
+ LD_UB2(pred, 16, pred0, pred1);
+ SUB2(src0, pred0, src1, pred1, dst0, dst1);
+ ST_UB2(dst0, dst1, dst, 16);
+ src += 32;
+ pred += 32;
+ dst += 32;
+ length -= 32;
+ }
+ if (length > 0) {
+ int i;
+ if (length >= 16) {
+ src0 = LD_UB(src);
+ pred0 = LD_UB(pred);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, dst);
+ src += 16;
+ pred += 16;
+ dst += 16;
+ length -= 16;
+ }
+ for (i = 0; i < length; i++) {
+ dst[i] = src[i] - pred[i];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width);
+
+//------------------------------------------------------------------------------
+// Horizontal filter
+
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* preds = data;
+ const uint8_t* in = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+ // Leftmost pixel is predicted from above.
+ PredictLineInverse0(in, preds - stride, out, 1);
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter
+
+static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
+ const uint8_t* ppred,
+ uint8_t* poutput, int stride,
+ int size) {
+ int w;
+ const v16i8 zero = { 0 };
+ while (size >= 16) {
+ v16u8 pred0, dst0;
+ v8i16 a0, a1, b0, b1, c0, c1;
+ const v16u8 tmp0 = LD_UB(ppred - 1);
+ const v16u8 tmp1 = LD_UB(ppred - stride);
+ const v16u8 tmp2 = LD_UB(ppred - stride - 1);
+ const v16u8 src0 = LD_UB(pinput);
+ ILVRL_B2_SH(zero, tmp0, a0, a1);
+ ILVRL_B2_SH(zero, tmp1, b0, b1);
+ ILVRL_B2_SH(zero, tmp2, c0, c1);
+ ADD2(a0, b0, a1, b1, a0, a1);
+ SUB2(a0, c0, a1, c1, a0, a1);
+ CLIP_SH2_0_255(a0, a1);
+ pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, poutput);
+ ppred += 16;
+ pinput += 16;
+ poutput += 16;
+ size -= 16;
+ }
+ for (w = 0; w < size; ++w) {
+ const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
+ poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
+ }
+}
+
+
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // left prediction for top scan-line
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+    out[0] = in[0] - preds[-stride];
+ PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter
+
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ in += stride;
+ out += stride;
+
+ // Filter line-by-line.
+ while (row < height) {
+ PredictLineInverse0(in, preds, out, width);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
+
+#endif // WEBP_USE_MSA
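PredictLineInverse0 above is a vectorized byte-wise subtraction: two 16-byte MSA loads per 32-byte iteration, a single 16-byte step, then a scalar tail. A minimal scalar sketch (hypothetical helper) of what every path computes:

static void PredictLineInverse0Scalar(const uint8_t* src, const uint8_t* pred,
                                      uint8_t* dst, int length) {
  int i;
  for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}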
diff --git a/media/libwebp/dsp/filters_sse2.c b/media/libwebp/dsp/filters_sse2.c
index 2cc9bb9766..9b91ab680f 100644
--- a/media/libwebp/dsp/filters_sse2.c
+++ b/media/libwebp/dsp/filters_sse2.c
@@ -163,7 +163,8 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
- out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ out[i] = (uint8_t)(row[i] - delta);
}
}
@@ -188,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
- out[0] = in[0] - in[-stride];
+ out[0] = (uint8_t)(in[0] - in[-stride]);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
@@ -223,7 +224,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
__m128i last;
- out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
if (width <= 1) return;
last = _mm_set_epi32(0, 0, 0, out[0]);
for (i = 1; i + 8 <= width; i += 8) {
@@ -238,7 +239,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
_mm_storel_epi64((__m128i*)(out + i), A7);
last = _mm_srli_epi64(A7, 56);
}
- for (; i < width; ++i) out[i] = in[i] + out[i - 1];
+ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
}
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
@@ -259,7 +260,7 @@ static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
_mm_storeu_si128((__m128i*)&out[i + 0], C0);
_mm_storeu_si128((__m128i*)&out[i + 16], C1);
}
- for (; i < width; ++i) out[i] = in[i] + prev[i];
+ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
}
}
@@ -296,7 +297,8 @@ static void GradientPredictInverse_SSE2(const uint8_t* const in,
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
- row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ row[i] = (uint8_t)(in[i] + delta);
}
}
}
@@ -306,7 +308,7 @@ static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
if (prev == NULL) {
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
- out[0] = in[0] + prev[0]; // predict from above
+ out[0] = (uint8_t)(in[0] + prev[0]); // predict from above
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
@@ -318,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+ // TODO(crbug.com/654974)
+ (void)VerticalUnfilter_SSE2;
+#else
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
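The filters_sse2.c hunks above are purely about integer promotion: in C, `uint8_t` operands widen to `int` before arithmetic, so expressions like `in[i] + out[i - 1]` may exceed 255, and the added `(uint8_t)` casts make the intended modulo-256 wrap explicit (and typically quiet implicit-conversion diagnostics). A tiny demonstration with hypothetical values:

#include <stdint.h>

static void PromotionDemo(void) {
  const uint8_t a = 200, b = 100;
  const int wide = a + b;                   // 300: both operands promote to int
  const uint8_t narrow = (uint8_t)(a + b);  // 44: the intended wrap, now explicit
  (void)wide;
  (void)narrow;
}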
diff --git a/media/libwebp/dsp/lossless.c b/media/libwebp/dsp/lossless.c
index 1a1523d221..763c425ff8 100644
--- a/media/libwebp/dsp/lossless.c
+++ b/media/libwebp/dsp/lossless.c
@@ -81,7 +81,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
// inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
+#if defined(__arm__) && defined(__GNUC__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
@@ -107,88 +107,107 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
-static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
-static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)top;
- return left;
+ return *left;
}
-static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[0];
}
-static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[1];
}
-static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[-1];
}
-static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average3(*left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2(*left, top[-1]);
return pred;
}
-static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2(*left, top[0]);
return pred;
}
-static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Select(top[0], *left, top[-1]);
return pred;
}
-static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
return pred;
}
-GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int x;
+ (void)upper;
+ for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
+}
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
+ (void)upper;
for (i = 0; i < num_pixels; ++i) {
out[i] = left = VP8LAddPixels(in[i], left);
}
- (void)upper;
}
-GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
-GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
-GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
-GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
-GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
-GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
-GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
-GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
-GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
-GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
-GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
-GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
@@ -270,14 +289,14 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = src[i];
- const uint32_t green = argb >> 8;
+ const int8_t green = (int8_t)(argb >> 8);
const uint32_t red = argb >> 16;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
- new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+ new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
new_blue &= 0xff;
dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
@@ -557,7 +576,6 @@ VP8LPredictorFunc VP8LPredictors[16];
// exposed plain-C implementations
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
-VP8LPredictorFunc VP8LPredictors_C[16];
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
@@ -571,6 +589,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
@@ -595,8 +614,7 @@ extern void VP8LDspInitMSA(void);
} while (0);
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
- COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
- COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+ COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
@@ -618,9 +636,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LDspInitSSE41();
+ }
+#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
@@ -635,7 +658,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
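The other substantive fix in lossless.c is typing `green` (and casting `new_red`) as `int8_t` in VP8LTransformColorInverse_C: ColorTransformDelta multiplies two *signed* 8-bit values, so channel bytes at or above 0x80 must be reinterpreted as negative before the multiply. A worked check, using the ColorTransformDelta helper exactly as it appears later in this patch:

static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
  return ((int)color_pred * color) >> 5;
}
// (int8_t)0x90 is -112, not 144, so:
//   ColorTransformDelta(10, (int8_t)0x90) == (10 * -112) >> 5 == -35
// With an unsigned green, the same call would yield (10 * 144) >> 5 == 45.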
diff --git a/media/libwebp/dsp/lossless.h b/media/libwebp/dsp/lossless.h
index 6db5fafc13..0c129d2860 100644
--- a/media/libwebp/dsp/lossless.h
+++ b/media/libwebp/dsp/lossless.h
@@ -28,9 +28,39 @@ extern "C" {
//------------------------------------------------------------------------------
// Decoding
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+ const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
-extern VP8LPredictorFunc VP8LPredictors_C[16];
+
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+ const uint32_t* const top);
+
+// These Add/Sub functions expect upper[-1] and out[-1] to be readable.
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,
diff --git a/media/libwebp/dsp/lossless_common.h b/media/libwebp/dsp/lossless_common.h
index dd2e4f247e..2b20637a28 100644
--- a/media/libwebp/dsp/lossless_common.h
+++ b/media/libwebp/dsp/lossless_common.h
@@ -177,24 +177,13 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
+ assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
- const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x); \
+ const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \
} \
}
-// It subtracts the prediction from the input pixel and stores the residual
-// in the output pixel.
-#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \
-static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
- int num_pixels, uint32_t* out) { \
- int x; \
- for (x = 0; x < num_pixels; ++x) { \
- const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \
- out[x] = VP8LSubPixels(in[x], pred); \
- } \
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
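With predictors now taking the left pixel by address, GENERATE_PREDICTOR_ADD hands each one `&out[x - 1]`, the previously reconstructed pixel. As a sketch of the expansion (not new code), GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C) produces roughly:

static void PredictorAdd2_C(const uint32_t* in, const uint32_t* upper,
                            int num_pixels, uint32_t* out) {
  int x;
  assert(upper != NULL);  // predictor 2 reads the row above
  for (x = 0; x < num_pixels; ++x) {
    const uint32_t pred = VP8LPredictor2_C(&out[x - 1], upper + x);
    out[x] = VP8LAddPixels(in[x], pred);
  }
}

The removed GENERATE_PREDICTOR_SUB counterpart reappears in lossless_enc.c below, rewritten to name the predictor by index.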
diff --git a/media/libwebp/dsp/lossless_enc.c b/media/libwebp/dsp/lossless_enc.c
new file mode 100644
index 0000000000..dca5e26be8
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc.c
@@ -0,0 +1,948 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+
+#include "../dsp/dsp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../dsp/yuv.h"
+
+// lookup table for small values of log2(int)
+const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.0000000000000000f, 0.0000000000000000f,
+ 1.0000000000000000f, 1.5849625007211560f,
+ 2.0000000000000000f, 2.3219280948873621f,
+ 2.5849625007211560f, 2.8073549220576041f,
+ 3.0000000000000000f, 3.1699250014423121f,
+ 3.3219280948873621f, 3.4594316186372973f,
+ 3.5849625007211560f, 3.7004397181410921f,
+ 3.8073549220576041f, 3.9068905956085187f,
+ 4.0000000000000000f, 4.0874628412503390f,
+ 4.1699250014423121f, 4.2479275134435852f,
+ 4.3219280948873626f, 4.3923174227787606f,
+ 4.4594316186372973f, 4.5235619560570130f,
+ 4.5849625007211560f, 4.6438561897747243f,
+ 4.7004397181410917f, 4.7548875021634682f,
+ 4.8073549220576037f, 4.8579809951275718f,
+ 4.9068905956085187f, 4.9541963103868749f,
+ 5.0000000000000000f, 5.0443941193584533f,
+ 5.0874628412503390f, 5.1292830169449663f,
+ 5.1699250014423121f, 5.2094533656289501f,
+ 5.2479275134435852f, 5.2854022188622487f,
+ 5.3219280948873626f, 5.3575520046180837f,
+ 5.3923174227787606f, 5.4262647547020979f,
+ 5.4594316186372973f, 5.4918530963296747f,
+ 5.5235619560570130f, 5.5545888516776376f,
+ 5.5849625007211560f, 5.6147098441152083f,
+ 5.6438561897747243f, 5.6724253419714951f,
+ 5.7004397181410917f, 5.7279204545631987f,
+ 5.7548875021634682f, 5.7813597135246599f,
+ 5.8073549220576037f, 5.8328900141647412f,
+ 5.8579809951275718f, 5.8826430493618415f,
+ 5.9068905956085187f, 5.9307373375628866f,
+ 5.9541963103868749f, 5.9772799234999167f,
+ 6.0000000000000000f, 6.0223678130284543f,
+ 6.0443941193584533f, 6.0660891904577720f,
+ 6.0874628412503390f, 6.1085244567781691f,
+ 6.1292830169449663f, 6.1497471195046822f,
+ 6.1699250014423121f, 6.1898245588800175f,
+ 6.2094533656289501f, 6.2288186904958804f,
+ 6.2479275134435852f, 6.2667865406949010f,
+ 6.2854022188622487f, 6.3037807481771030f,
+ 6.3219280948873626f, 6.3398500028846243f,
+ 6.3575520046180837f, 6.3750394313469245f,
+ 6.3923174227787606f, 6.4093909361377017f,
+ 6.4262647547020979f, 6.4429434958487279f,
+ 6.4594316186372973f, 6.4757334309663976f,
+ 6.4918530963296747f, 6.5077946401986963f,
+ 6.5235619560570130f, 6.5391588111080309f,
+ 6.5545888516776376f, 6.5698556083309478f,
+ 6.5849625007211560f, 6.5999128421871278f,
+ 6.6147098441152083f, 6.6293566200796094f,
+ 6.6438561897747243f, 6.6582114827517946f,
+ 6.6724253419714951f, 6.6865005271832185f,
+ 6.7004397181410917f, 6.7142455176661224f,
+ 6.7279204545631987f, 6.7414669864011464f,
+ 6.7548875021634682f, 6.7681843247769259f,
+ 6.7813597135246599f, 6.7944158663501061f,
+ 6.8073549220576037f, 6.8201789624151878f,
+ 6.8328900141647412f, 6.8454900509443747f,
+ 6.8579809951275718f, 6.8703647195834047f,
+ 6.8826430493618415f, 6.8948177633079437f,
+ 6.9068905956085187f, 6.9188632372745946f,
+ 6.9307373375628866f, 6.9425145053392398f,
+ 6.9541963103868749f, 6.9657842846620869f,
+ 6.9772799234999167f, 6.9886846867721654f,
+ 7.0000000000000000f, 7.0112272554232539f,
+ 7.0223678130284543f, 7.0334230015374501f,
+ 7.0443941193584533f, 7.0552824355011898f,
+ 7.0660891904577720f, 7.0768155970508308f,
+ 7.0874628412503390f, 7.0980320829605263f,
+ 7.1085244567781691f, 7.1189410727235076f,
+ 7.1292830169449663f, 7.1395513523987936f,
+ 7.1497471195046822f, 7.1598713367783890f,
+ 7.1699250014423121f, 7.1799090900149344f,
+ 7.1898245588800175f, 7.1996723448363644f,
+ 7.2094533656289501f, 7.2191685204621611f,
+ 7.2288186904958804f, 7.2384047393250785f,
+ 7.2479275134435852f, 7.2573878426926521f,
+ 7.2667865406949010f, 7.2761244052742375f,
+ 7.2854022188622487f, 7.2946207488916270f,
+ 7.3037807481771030f, 7.3128829552843557f,
+ 7.3219280948873626f, 7.3309168781146167f,
+ 7.3398500028846243f, 7.3487281542310771f,
+ 7.3575520046180837f, 7.3663222142458160f,
+ 7.3750394313469245f, 7.3837042924740519f,
+ 7.3923174227787606f, 7.4008794362821843f,
+ 7.4093909361377017f, 7.4178525148858982f,
+ 7.4262647547020979f, 7.4346282276367245f,
+ 7.4429434958487279f, 7.4512111118323289f,
+ 7.4594316186372973f, 7.4676055500829976f,
+ 7.4757334309663976f, 7.4838157772642563f,
+ 7.4918530963296747f, 7.4998458870832056f,
+ 7.5077946401986963f, 7.5156998382840427f,
+ 7.5235619560570130f, 7.5313814605163118f,
+ 7.5391588111080309f, 7.5468944598876364f,
+ 7.5545888516776376f, 7.5622424242210728f,
+ 7.5698556083309478f, 7.5774288280357486f,
+ 7.5849625007211560f, 7.5924570372680806f,
+ 7.5999128421871278f, 7.6073303137496104f,
+ 7.6147098441152083f, 7.6220518194563764f,
+ 7.6293566200796094f, 7.6366246205436487f,
+ 7.6438561897747243f, 7.6510516911789281f,
+ 7.6582114827517946f, 7.6653359171851764f,
+ 7.6724253419714951f, 7.6794800995054464f,
+ 7.6865005271832185f, 7.6934869574993252f,
+ 7.7004397181410917f, 7.7073591320808825f,
+ 7.7142455176661224f, 7.7210991887071855f,
+ 7.7279204545631987f, 7.7347096202258383f,
+ 7.7414669864011464f, 7.7481928495894605f,
+ 7.7548875021634682f, 7.7615512324444795f,
+ 7.7681843247769259f, 7.7747870596011736f,
+ 7.7813597135246599f, 7.7879025593914317f,
+ 7.7944158663501061f, 7.8008998999203047f,
+ 7.8073549220576037f, 7.8137811912170374f,
+ 7.8201789624151878f, 7.8265484872909150f,
+ 7.8328900141647412f, 7.8392037880969436f,
+ 7.8454900509443747f, 7.8517490414160571f,
+ 7.8579809951275718f, 7.8641861446542797f,
+ 7.8703647195834047f, 7.8765169465649993f,
+ 7.8826430493618415f, 7.8887432488982591f,
+ 7.8948177633079437f, 7.9008668079807486f,
+ 7.9068905956085187f, 7.9128893362299619f,
+ 7.9188632372745946f, 7.9248125036057812f,
+ 7.9307373375628866f, 7.9366379390025709f,
+ 7.9425145053392398f, 7.9483672315846778f,
+ 7.9541963103868749f, 7.9600019320680805f,
+ 7.9657842846620869f, 7.9715435539507719f,
+ 7.9772799234999167f, 7.9829935746943103f,
+ 7.9886846867721654f, 7.9943534368588577f
+};
+
+const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
+ 8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
+ 24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
+ 43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
+ 64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
+ 86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
+ 110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
+ 134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
+ 160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
+ 186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
+ 212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
+ 240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
+ 268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
+ 296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
+ 325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
+ 354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
+ 384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
+ 413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
+ 444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
+ 474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
+ 505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
+ 536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
+ 568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
+ 600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
+ 632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
+ 664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
+ 696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
+ 729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
+ 762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
+ 795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
+ 828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
+ 862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
+ 896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
+ 929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
+ 963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
+ 998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
+ 1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
+ 1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
+ 1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
+ 1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
+ 1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
+ 1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
+ 1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
+ 1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
+ 1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
+ 1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
+ 1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
+ 1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
+ 1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
+ 1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
+ 1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
+ 1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
+ 1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
+ 1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
+ 1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
+ 1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
+ 1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
+ 1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
+ 1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
+ 1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
+ 1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
+ 1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
+ 1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
+ 2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
+};
+
+const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
+ { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
+ { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
+ { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
+ { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
+ { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+};
+
+const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
+ 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+ 127,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
+};
+
+static float FastSLog2Slow_C(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ v >>= log_cnt;
+#else
+ int log_cnt = 0;
+ uint32_t y = 1;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d; for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
+ correction = (23 * (orig_v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[v] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
+
+static float FastLog2Slow_C(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ const uint32_t orig_v = v;
+ double log_2;
+ v >>= log_cnt;
+#else
+ int log_cnt = 0;
+ uint32_t y = 1;
+ const uint32_t orig_v = v;
+ double log_2;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+ log_2 = kLog2Table[v] + log_cnt;
+ if (orig_v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+ const int correction = (23 * (orig_v & (y - 1))) >> 4;
+ log_2 += (double)correction / orig_v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+// Compute the combined Shannon entropy for the distributions {X} and {X+Y}.
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX = 0, sumXY = 0;
+ for (i = 0; i < 256; ++i) {
+ const int x = X[i];
+ if (x != 0) {
+ const int xy = x + Y[i];
+ sumX += x;
+ retval -= VP8LFastSLog2(x);
+ sumXY += xy;
+ retval -= VP8LFastSLog2(xy);
+ } else if (Y[i] != 0) {
+ sumXY += Y[i];
+ retval -= VP8LFastSLog2(Y[i]);
+ }
+ }
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
+ entropy->entropy = 0.;
+ entropy->sum = 0;
+ entropy->nonzeros = 0;
+ entropy->max_val = 0;
+ entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
+}
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+ VP8LBitEntropy* const entropy) {
+ int i;
+
+ VP8LBitEntropyInit(entropy);
+
+ for (i = 0; i < n; ++i) {
+ if (array[i] != 0) {
+ entropy->sum += array[i];
+ entropy->nonzero_code = i;
+ ++entropy->nonzeros;
+ entropy->entropy -= VP8LFastSLog2(array[i]);
+ if (entropy->max_val < array[i]) {
+ entropy->max_val = array[i];
+ }
+ }
+ }
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
+ }
+ }
+
+ // Gather info for the Huffman cost.
+ stats->counts[*val_prev != 0] += (streak > 3);
+ stats->streaks[*val_prev != 0][(streak > 3)] += streak;
+
+ *val_prev = val;
+ *i_prev = i;
+}
+
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i = 1;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+//------------------------------------------------------------------------------
+
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const int argb = argb_data[i];
+ const int green = (argb >> 8) & 0xff;
+ const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+ const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
+ argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
+ }
+}
+
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
+ return ((int)color_pred * color) >> 5;
+}
+
+static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
+ return (int8_t)(v & 0xff);
+}
+
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const int8_t green = U32ToS8(argb >> 8);
+ const int8_t red = U32ToS8(argb >> 16);
+ int new_red = red & 0xff;
+ int new_blue = argb & 0xff;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const int8_t green = U32ToS8(argb >> 8);
+ int new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const int8_t green = U32ToS8(argb >> 8);
+ const int8_t red = U32ToS8(argb >> 16);
+ uint8_t new_blue = argb & 0xff;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorRed((uint8_t)green_to_red, argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue,
+ argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_C(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
+ int match_len = 0;
+
+ while (match_len < length && array1[match_len] == array2[match_len]) {
+ ++match_len;
+ }
+ return match_len;
+}
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
+ int x;
+ if (xbits > 0) {
+ const int bit_depth = 1 << (3 - xbits);
+ const int mask = (1 << xbits) - 1;
+ uint32_t code = 0xff000000;
+ for (x = 0; x < width; ++x) {
+ const int xsub = x & mask;
+ if (xsub == 0) {
+ code = 0xff000000;
+ }
+ code |= row[x] << (8 + bit_depth * xsub);
+ dst[x >> xbits] = code;
+ }
+ } else {
+ for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static double ExtraCost_C(const uint32_t* population, int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+ return cost;
+}
+
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+ int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) {
+ const int xy = X[i + 2] + Y[i + 2];
+ cost += (i >> 1) * xy;
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+
+static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
+}
+
+static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ for (i = 0; i < size; ++i) out[i] += a[i];
+}
+
+#define ADD(X, ARG, LEN) do { \
+ if (a->is_used_[X]) { \
+ if (b->is_used_[X]) { \
+ VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \
+ } else { \
+ memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } \
+ } else if (b->is_used_[X]) { \
+ memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } else { \
+ memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \
+ } \
+} while (0)
+
+#define ADD_EQ(X, ARG, LEN) do { \
+ if (a->is_used_[X]) { \
+ if (out->is_used_[X]) { \
+ VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \
+ } else { \
+ memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } \
+ } \
+} while (0)
+
+void VP8LHistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b, VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+
+ if (b != out) {
+ ADD(0, literal_, literal_size);
+ ADD(1, red_, NUM_LITERAL_CODES);
+ ADD(2, blue_, NUM_LITERAL_CODES);
+ ADD(3, alpha_, NUM_LITERAL_CODES);
+ ADD(4, distance_, NUM_DISTANCE_CODES);
+ for (i = 0; i < 5; ++i) {
+ out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
+ }
+ } else {
+ ADD_EQ(0, literal_, literal_size);
+ ADD_EQ(1, red_, NUM_LITERAL_CODES);
+ ADD_EQ(2, blue_, NUM_LITERAL_CODES);
+ ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
+ ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
+ for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
+ }
+}
+#undef ADD
+#undef ADD_EQ
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
+ (void)upper;
+}
+
+static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
+ (void)upper;
+}
+
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
+static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
+ const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int x; \
+ assert(upper != NULL); \
+ for (x = 0; x < num_pixels; ++x) { \
+ const uint32_t pred = \
+ VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x); \
+ out[x] = VP8LSubPixels(in[x], pred); \
+ } \
+}
+
+GENERATE_PREDICTOR_SUB(2)
+GENERATE_PREDICTOR_SUB(3)
+GENERATE_PREDICTOR_SUB(4)
+GENERATE_PREDICTOR_SUB(5)
+GENERATE_PREDICTOR_SUB(6)
+GENERATE_PREDICTOR_SUB(7)
+GENERATE_PREDICTOR_SUB(8)
+GENERATE_PREDICTOR_SUB(9)
+GENERATE_PREDICTOR_SUB(10)
+GENERATE_PREDICTOR_SUB(11)
+GENERATE_PREDICTOR_SUB(12)
+GENERATE_PREDICTOR_SUB(13)
+
+//------------------------------------------------------------------------------
+
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+
+VP8LTransformColorFunc VP8LTransformColor;
+
+VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+
+VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
+VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
+
+VP8LAddVectorFunc VP8LAddVector;
+VP8LAddVectorEqFunc VP8LAddVectorEq;
+
+VP8LVectorMismatchFunc VP8LVectorMismatch;
+VP8LBundleColorMapFunc VP8LBundleColorMap;
+
+VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
+
+extern void VP8LEncDspInitSSE2(void);
+extern void VP8LEncDspInitSSE41(void);
+extern void VP8LEncDspInitNEON(void);
+extern void VP8LEncDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPSdspR2(void);
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
+ VP8LDspInit();
+
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+
+ VP8LTransformColor = VP8LTransformColor_C;
+#endif
+
+ VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
+ VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
+
+ VP8LFastLog2Slow = FastLog2Slow_C;
+ VP8LFastSLog2Slow = FastSLog2Slow_C;
+
+ VP8LExtraCost = ExtraCost_C;
+ VP8LExtraCostCombined = ExtraCostCombined_C;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
+
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
+
+ VP8LAddVector = AddVector_C;
+ VP8LAddVectorEq = AddVectorEq_C;
+
+ VP8LVectorMismatch = VectorMismatch_C;
+ VP8LBundleColorMap = VP8LBundleColorMap_C;
+
+ VP8LPredictorsSub[0] = PredictorSub0_C;
+ VP8LPredictorsSub[1] = PredictorSub1_C;
+ VP8LPredictorsSub[2] = PredictorSub2_C;
+ VP8LPredictorsSub[3] = PredictorSub3_C;
+ VP8LPredictorsSub[4] = PredictorSub4_C;
+ VP8LPredictorsSub[5] = PredictorSub5_C;
+ VP8LPredictorsSub[6] = PredictorSub6_C;
+ VP8LPredictorsSub[7] = PredictorSub7_C;
+ VP8LPredictorsSub[8] = PredictorSub8_C;
+ VP8LPredictorsSub[9] = PredictorSub9_C;
+ VP8LPredictorsSub[10] = PredictorSub10_C;
+ VP8LPredictorsSub[11] = PredictorSub11_C;
+ VP8LPredictorsSub[12] = PredictorSub12_C;
+ VP8LPredictorsSub[13] = PredictorSub13_C;
+ VP8LPredictorsSub[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_C;
+
+ VP8LPredictorsSub_C[0] = PredictorSub0_C;
+ VP8LPredictorsSub_C[1] = PredictorSub1_C;
+ VP8LPredictorsSub_C[2] = PredictorSub2_C;
+ VP8LPredictorsSub_C[3] = PredictorSub3_C;
+ VP8LPredictorsSub_C[4] = PredictorSub4_C;
+ VP8LPredictorsSub_C[5] = PredictorSub5_C;
+ VP8LPredictorsSub_C[6] = PredictorSub6_C;
+ VP8LPredictorsSub_C[7] = PredictorSub7_C;
+ VP8LPredictorsSub_C[8] = PredictorSub8_C;
+ VP8LPredictorsSub_C[9] = PredictorSub9_C;
+ VP8LPredictorsSub_C[10] = PredictorSub10_C;
+ VP8LPredictorsSub_C[11] = PredictorSub11_C;
+ VP8LPredictorsSub_C[12] = PredictorSub12_C;
+ VP8LPredictorsSub_C[13] = PredictorSub13_C;
+ VP8LPredictorsSub_C[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub_C[15] = PredictorSub0_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8LEncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LEncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8LEncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8LEncDspInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8LEncDspInitMSA();
+ }
+#endif
+ }
+
+#if defined(WEBP_HAVE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8LEncDspInitNEON();
+ }
+#endif
+
+ assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+ assert(VP8LTransformColor != NULL);
+ assert(VP8LCollectColorBlueTransforms != NULL);
+ assert(VP8LCollectColorRedTransforms != NULL);
+ assert(VP8LFastLog2Slow != NULL);
+ assert(VP8LFastSLog2Slow != NULL);
+ assert(VP8LExtraCost != NULL);
+ assert(VP8LExtraCostCombined != NULL);
+ assert(VP8LCombinedShannonEntropy != NULL);
+ assert(VP8LGetEntropyUnrefined != NULL);
+ assert(VP8LGetCombinedEntropyUnrefined != NULL);
+ assert(VP8LAddVector != NULL);
+ assert(VP8LAddVectorEq != NULL);
+ assert(VP8LVectorMismatch != NULL);
+ assert(VP8LBundleColorMap != NULL);
+ assert(VP8LPredictorsSub[0] != NULL);
+ assert(VP8LPredictorsSub[1] != NULL);
+ assert(VP8LPredictorsSub[2] != NULL);
+ assert(VP8LPredictorsSub[3] != NULL);
+ assert(VP8LPredictorsSub[4] != NULL);
+ assert(VP8LPredictorsSub[5] != NULL);
+ assert(VP8LPredictorsSub[6] != NULL);
+ assert(VP8LPredictorsSub[7] != NULL);
+ assert(VP8LPredictorsSub[8] != NULL);
+ assert(VP8LPredictorsSub[9] != NULL);
+ assert(VP8LPredictorsSub[10] != NULL);
+ assert(VP8LPredictorsSub[11] != NULL);
+ assert(VP8LPredictorsSub[12] != NULL);
+ assert(VP8LPredictorsSub[13] != NULL);
+ assert(VP8LPredictorsSub[14] != NULL);
+ assert(VP8LPredictorsSub[15] != NULL);
+ assert(VP8LPredictorsSub_C[0] != NULL);
+ assert(VP8LPredictorsSub_C[1] != NULL);
+ assert(VP8LPredictorsSub_C[2] != NULL);
+ assert(VP8LPredictorsSub_C[3] != NULL);
+ assert(VP8LPredictorsSub_C[4] != NULL);
+ assert(VP8LPredictorsSub_C[5] != NULL);
+ assert(VP8LPredictorsSub_C[6] != NULL);
+ assert(VP8LPredictorsSub_C[7] != NULL);
+ assert(VP8LPredictorsSub_C[8] != NULL);
+ assert(VP8LPredictorsSub_C[9] != NULL);
+ assert(VP8LPredictorsSub_C[10] != NULL);
+ assert(VP8LPredictorsSub_C[11] != NULL);
+ assert(VP8LPredictorsSub_C[12] != NULL);
+ assert(VP8LPredictorsSub_C[13] != NULL);
+ assert(VP8LPredictorsSub_C[14] != NULL);
+ assert(VP8LPredictorsSub_C[15] != NULL);
+}
+
+//------------------------------------------------------------------------------
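Of the encoder transforms above, subtract-green is the easiest to verify by hand: green is subtracted from red and blue modulo 256, while alpha and green pass through unchanged. A worked single-pixel example (hypothetical pixel value, in a sketch helper):

static void SubtractGreenDemo(void) {
  uint32_t px = 0xff405060u;                 // a=ff r=40 g=50 b=60
  VP8LSubtractGreenFromBlueAndRed_C(&px, 1);
  // r' = (0x40 - 0x50) & 0xff == 0xf0, b' = (0x60 - 0x50) & 0xff == 0x10,
  // so px is now 0xfff05010u.
}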
diff --git a/media/libwebp/dsp/lossless_enc_mips32.c b/media/libwebp/dsp/lossless_enc_mips32.c
new file mode 100644
index 0000000000..088e608b44
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips32.c
@@ -0,0 +1,397 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static float FastSLog2Slow_MIPS32(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y, correction;
+ const int c24 = 24;
+ const float v_f = (float)v;
+ uint32_t temp;
+
+ // Xf = 256 = 2^8
+ // log_cnt is index of leading one in upper 24 bits
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
+
+ // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
+ correction = (23 * (v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[temp] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
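
The asm block only replaces the search for the leading one (clz) and the split of v into a table index plus a remainder; the surrounding algorithm is the portable one. A C sketch of that decomposition, assuming this file's includes and the kLog2Table / threshold constants from lossless_common.h:

    static float FastSLog2Slow_sketch(uint32_t v) {
      assert(v >= LOG_LOOKUP_IDX_MAX);
      if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
        int log_cnt = 0;
        uint32_t y = 1;                       /* y == 2^log_cnt */
        const uint32_t orig_v = v;
        const float v_f = (float)v;
        float correction;
        do {                                  /* what clz/sllv/srlv compute */
          ++log_cnt;
          v = v >> 1;
          y = y << 1;
        } while (v >= LOG_LOOKUP_IDX_MAX);
        /* linear correction term, ~ LOG_2_RECIPROCAL * (orig_v % y) */
        correction = (float)((23 * (orig_v & (y - 1))) >> 4);
        return v_f * (kLog2Table[v] + log_cnt) + correction;
      } else {
        return (float)(LOG_2_RECIPROCAL * v * log((double)v));
      }
    }
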
+
+static float FastLog2Slow_MIPS32(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y;
+ const int c24 = 24;
+ double log_2;
+ uint32_t temp;
+
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ log_2 = kLog2Table[temp] + log_cnt;
+ if (v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+
+ const uint32_t correction = (23 * (v & (y - 1))) >> 4;
+ log_2 += (double)correction / v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pop = &population[4];
+// const uint32_t* LoopEnd = &population[length];
+// while (pop != LoopEnd) {
+// ++i;
+// cost += i * *pop;
+// cost += i * *(pop + 1);
+// pop += 2;
+// }
+// return (double)cost;
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+ int i, temp0, temp1;
+ const uint32_t* pop = &population[4];
+ const uint32_t* const LoopEnd = &population[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pop], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pop]) \n\t"
+ "lw %[temp1], 4(%[pop]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addiu %[pop], %[pop], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp1] \n\t"
+ "bne %[pop], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [i]"=&r"(i), [pop]"+r"(pop)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pX = &X[4];
+// const uint32_t* pY = &Y[4];
+// const uint32_t* LoopEnd = &X[length];
+// while (pX != LoopEnd) {
+// const uint32_t xy0 = *pX + *pY;
+// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
+// ++i;
+// cost += i * xy0;
+// cost += i * xy1;
+// pX += 2;
+// pY += 2;
+// }
+// return (double)cost;
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
+ int i, temp0, temp1, temp2, temp3;
+ const uint32_t* pX = &X[4];
+ const uint32_t* pY = &Y[4];
+ const uint32_t* const LoopEnd = &X[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pX], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pX]) \n\t"
+ "lw %[temp1], 0(%[pY]) \n\t"
+ "lw %[temp2], 4(%[pX]) \n\t"
+ "lw %[temp3], 4(%[pY]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addiu %[pX], %[pX], 8 \n\t"
+ "addiu %[pY], %[pY], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp2] \n\t"
+ "bne %[pX], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+#define HUFFMAN_COST_PASS \
+ __asm__ volatile( \
+ "sll %[temp1], %[temp0], 3 \n\t" \
+ "addiu %[temp3], %[streak], -3 \n\t" \
+ "addu %[temp2], %[pstreaks], %[temp1] \n\t" \
+ "blez %[temp3], 1f \n\t" \
+ "srl %[temp1], %[temp1], 1 \n\t" \
+ "addu %[temp3], %[pcnts], %[temp1] \n\t" \
+ "lw %[temp0], 4(%[temp2]) \n\t" \
+ "lw %[temp1], 0(%[temp3]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "addiu %[temp1], %[temp1], 1 \n\t" \
+ "sw %[temp0], 4(%[temp2]) \n\t" \
+ "sw %[temp1], 0(%[temp3]) \n\t" \
+ "b 2f \n\t" \
+ "1: \n\t" \
+ "lw %[temp0], 0(%[temp2]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "sw %[temp0], 0(%[temp2]) \n\t" \
+ "2: \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
+ : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
+ [streak]"r"(streak) \
+ : "memory" \
+ );
+
+// Returns the various RLE counts
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ int* const pstreaks = &stats->streaks[0][0];
+ int* const pcnts = &stats->counts[0];
+ int temp0, temp1, temp2, temp3;
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
+ }
+ }
+
+ // Gather info for the Huffman cost.
+ temp0 = (*val_prev != 0);
+ HUFFMAN_COST_PASS
+
+ *val_prev = val;
+ *i_prev = i;
+}
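
The HUFFMAN_COST_PASS block is the asm form of two small table updates, keyed on whether the previous value was nonzero (temp0) and whether the streak is longer than three. In plain C the same bookkeeping reads roughly as below (a sketch; VP8LStreaks comes from the dsp headers already included here):

    static void HuffmanCostPassScalar(int val_prev_nonzero, int streak,
                                      VP8LStreaks* const stats) {
      /* row = nonzero flag, column / counter bump = long-streak flag */
      stats->counts[val_prev_nonzero] += (streak > 3);
      stats->streaks[val_prev_nonzero][(streak > 3)] += streak;
    }
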
+
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const entropy,
+ VP8LStreaks* const stats) {
+ int i = 1;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
+
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+#define ASM_START \
+ __asm__ volatile( \
+ ".set push \n\t" \
+ ".set at \n\t" \
+ ".set macro \n\t" \
+ "1: \n\t"
+
+// P2 = P0 + P1
+// A..D - offsets
+// E - temp variable to tell macro
+// if pointer should be incremented
+// literal_ and successive histograms could be unaligned
+// so we must use ulw and usw
+#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
+ "ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
+ "ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
+ "ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
+ "ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
+ "ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
+ "ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
+ "ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
+ "ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
+ "addu %[temp4], %[temp4], %[temp0] \n\t" \
+ "addu %[temp5], %[temp5], %[temp1] \n\t" \
+ "addu %[temp6], %[temp6], %[temp2] \n\t" \
+ "addu %[temp7], %[temp7], %[temp3] \n\t" \
+ "addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
+ ".if " #E " == 1 \n\t" \
+ "addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
+ ".endif \n\t" \
+ "usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
+ "usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
+ "usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
+ "usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
+ "addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
+ "bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
+ ".set pop \n\t" \
+
+#define ASM_END_COMMON_0 \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
+ [pa]"+r"(pa), [pout]"+r"(pout)
+
+#define ASM_END_COMMON_1 \
+ : [LoopEnd]"r"(LoopEnd) \
+ : "memory", "at" \
+ );
+
+#define ASM_END_0 \
+ ASM_END_COMMON_0 \
+ , [pb]"+r"(pb) \
+ ASM_END_COMMON_1
+
+#define ASM_END_1 \
+ ASM_END_COMMON_0 \
+ ASM_END_COMMON_1
+
+static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
+ uint32_t* pout, int size) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t end = ((size) / 4) * 4;
+ const uint32_t* const LoopEnd = pa + end;
+ int i;
+ ASM_START
+ ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
+ ASM_END_0
+ for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+}
+
+static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t end = ((size) / 4) * 4;
+ const uint32_t* const LoopEnd = pa + end;
+ int i;
+ ASM_START
+ ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
+ ASM_END_1
+ for (i = end; i < size; ++i) pout[i] += pa[i];
+}
+
+#undef ASM_END_1
+#undef ASM_END_0
+#undef ASM_END_COMMON_1
+#undef ASM_END_COMMON_0
+#undef ADD_TO_OUT
+#undef ASM_START
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
+ VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+ VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+ VP8LExtraCost = ExtraCost_MIPS32;
+ VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+ VP8LAddVector = AddVector_MIPS32;
+ VP8LAddVectorEq = AddVectorEq_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..157dfc2e01
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
@@ -0,0 +1,281 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+ int num_pixels) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
+ uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[argb_data], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "lw %[temp1], 4(%[argb_data]) \n\t"
+ "lw %[temp2], 8(%[argb_data]) \n\t"
+ "lw %[temp3], 12(%[argb_data]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[argb_data], %[argb_data], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "subu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "subu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[argb_data]) \n\t"
+ "sw %[temp1], -12(%[argb_data]) \n\t"
+ "sw %[temp2], -8(%[argb_data]) \n\t"
+ "bne %[argb_data], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[argb_data]) \n\t"
+ "3: \n\t"
+ "beq %[argb_data], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "addiu %[argb_data], %[argb_data], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[argb_data], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[argb_data]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
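
In scalar form the transform is simply green subtracted, modulo 256, from both red and blue; a sketch matching the portable fallback this routine replaces:

    static void SubtractGreenScalar(uint32_t* argb_data, int num_pixels) {
      int i;
      for (i = 0; i < num_pixels; ++i) {
        const uint32_t argb = argb_data[i];
        const uint32_t green = (argb >> 8) & 0xff;
        const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
        const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
        argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
      }
    }
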
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+ int8_t color) {
+ return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red, new_red1;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ uint32_t* const p_loop_end = data + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[data], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[data]) \n\t"
+ "lw %[argb1], 4(%[data]) \n\t"
+ "lhu %[new_red], 2(%[data]) \n\t"
+ "lhu %[new_red1], 6(%[data]) \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "preceu.ph.qbla %[temp4], %[temp4] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shll.ph %[temp4], %[temp4], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "addiu %[data], %[data], 8 \n\t"
+ "ins %[new_red1], %[new_red], 16, 16 \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red1] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -2(%[data]) \n\t"
+ "sb %[temp3], -4(%[data]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "sb %[temp5], -6(%[data]) \n\t"
+ "bne %[data], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[data]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+ [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ if (num_pixels & 1) {
+ const uint32_t argb_ = data[0];
+ const uint32_t green = argb_ >> 8;
+ const uint32_t red = argb_ >> 16;
+ uint32_t new_blue = argb_;
+ new_red = red;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint8_t new_blue = argb;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_blue,
+ int red_to_blue,
+ int histo[]) {
+ const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
+ const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
+ const uint32_t mask = 0xff00ffu;
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp5], %[temp2], %[rtb] \n\t"
+ "mul.ph %[temp6], %[temp3], %[gtb] \n\t"
+ "and %[temp4], %[temp1], %[mask] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp6], %[temp6], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+ : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+ }
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ uint32_t new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_red,
+ int histo[]) {
+ const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp2], %[temp3], %[gtr] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp2] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [gtr]"r"(gtr)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorRed(green_to_red, *p_argb)];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+ VP8LTransformColor = TransformColor_MIPSdspR2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_enc_msa.c b/media/libwebp/dsp/lossless_enc_msa.c
new file mode 100644
index 0000000000..f8a5f8c56f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_msa.c
@@ -0,0 +1,148 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of Image transform methods for lossless encoder.
+//
+// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_subv_h((v8i16)src0, t0); \
+ t1 = __msa_subv_h((v8i16)src1, t1); \
+ t4 = __msa_srli_w((v4i32)src0, 16); \
+ t5 = __msa_srli_w((v4i32)src1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ SUB2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_subv_h((v8i16)src, t0); \
+ t2 = __msa_srli_w((v4i32)src, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 - t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(data, 4, src0, src1);
+ TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, data, 4);
+ data += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, data);
+ data += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, data + 0);
+ SW(pix_w, data + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, data);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, data);
+ }
+ }
+ }
+}
+
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ uint8_t* ptemp_data = (uint8_t*)argb_data;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(ptemp_data, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_data, 16);
+ ptemp_data += 8 * 4;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_data);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 - tmp0;
+ ST_UB(dst0, ptemp_data);
+ ptemp_data += 4 * 4;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = ptemp_data[0];
+ const uint8_t g = ptemp_data[1];
+ const uint8_t r = ptemp_data[2];
+ ptemp_data[0] = (b - g) & 0xff;
+ ptemp_data[2] = (r - g) & 0xff;
+ ptemp_data += 4;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+ VP8LTransformColor = TransformColor_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_enc_neon.c b/media/libwebp/dsp/lossless_enc_neon.c
new file mode 100644
index 0000000000..89d5439e49
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_neon.c
@@ -0,0 +1,144 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "../dsp/lossless.h"
+#include "../dsp/neon.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+ defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+ 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
+ return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+ vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
+ return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+ vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+ int num_pixels) {
+ const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+ const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+ for (; argb_data < end; argb_data += 4) {
+ const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
+ vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X) (((int16_t)(m->X << 8)) >> 6)
+ const int16_t rb[8] = {
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_)
+ };
+ const int16x8_t mults_rb = vld1q_s16(rb);
+ const int16_t b2[8] = {
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ };
+ const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
+#ifdef USE_VTBLQ
+ static const uint8_t kg0g0[16] = {
+ 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+ };
+ const uint8x16_t shuffle = vld1q_u8(kg0g0);
+#else
+ static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+ const uint8x8_t shuffle = vld1_u8(k0g0g);
+#endif
+ const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+ // 0 g 0 g
+ const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
+ // x dr x db1
+ const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+ // r 0 b 0
+ const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
+ // x db2 0 0
+ const int16x8_t C = vqdmulhq_s16(B, mults_b2);
+ // 0 0 x db2
+ const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
+ // x dr x db
+ const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
+ vreinterpretq_s8_s16(A));
+ // 0 dr 0 db
+ const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
+ const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
+ vreinterpretq_s8_u32(F));
+ vst1q_s8((int8_t*)(argb_data + i), out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
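
The pre-shift by 6 (instead of 5) compensates for the doubling in vqdmulhq_s16, which returns (a * b * 2) >> 16: with the green sample in the high byte of a 16-bit lane (g << 8) and the constant scaled to s * 4, a lane comes out as ((g << 8) * 4s * 2) >> 16 == (g * s) >> 5, i.e. the spec's ColorTransformDelta on sign-extended 8-bit values. A standalone scalar check of that equality, with arbitrary sample values:

    #include <assert.h>
    #include <stdint.h>

    static int16_t Cst(uint8_t m) {                 /* mirrors CST(X) above */
      return (int16_t)((int16_t)(m << 8) >> 6);
    }
    static int16_t QDMulH(int16_t a, int16_t b) {   /* vqdmulhq_s16, one lane */
      return (int16_t)(((int32_t)a * b * 2) >> 16);
    }
    static int ColorDelta(int8_t pred, int8_t color) {  /* spec definition */
      return ((int)pred * color) >> 5;
    }

    int main(void) {
      const uint8_t m = 0xef, g = 200;              /* arbitrary inputs */
      assert(QDMulH((int16_t)((int8_t)g << 8), Cst(m)) ==
             ColorDelta((int8_t)m, (int8_t)g));
      return 0;
    }
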
+
+#undef USE_VTBLQ
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+ VP8LTransformColor = TransformColor_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/lossless_enc_sse2.c b/media/libwebp/dsp/lossless_enc_sse2.c
new file mode 100644
index 0000000000..665ceb669c
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse2.c
@@ -0,0 +1,669 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "../dsp/lossless.h"
+#include "../dsp/common_sse2.h"
+#include "../dsp/lossless_common.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
+ const __m128i out = _mm_sub_epi8(in, C);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
+}
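
The shufflelo/shufflehi pair is what broadcasts each pixel's green byte into both 16-bit lanes of that pixel, so a single byte-wise subtraction updates red and blue at once. A small standalone check on four identical pixels, with an arbitrary sample value:

    #include <assert.h>
    #include <emmintrin.h>
    #include <stdint.h>

    int main(void) {
      const __m128i in = _mm_set1_epi32((int)0xff804020u);  /* a r g b */
      const __m128i A = _mm_srli_epi16(in, 8);              /* 0 a 0 g */
      const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
      const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));
      const __m128i out = _mm_sub_epi8(in, C);
      uint32_t res[4];
      _mm_storeu_si128((__m128i*)res, out);
      /* green untouched; red and blue reduced by green, mod 256 */
      assert(res[0] == 0xff4040e0u);
      return 0;
    }
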
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
+ CST_5b(m->green_to_blue_));
+ const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
+ const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
+ const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
+ const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
+ const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
+ const __m128i H = _mm_add_epi8(G, D); // x dr x db
+ const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
+ const __m128i out = _mm_sub_epi8(in, I);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+ }
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 8
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
+ const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
+ const __m128i A1 = _mm_slli_epi16(in1, 8);
+ const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i B1 = _mm_and_si128(in1, mask_g);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
+ const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
+ const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
+ const __m128i E1 = _mm_sub_epi8(in1, D1);
+ const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
+ const __m128i F1 = _mm_srli_epi32(C1, 16);
+ const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
+ const __m128i G1 = _mm_sub_epi8(E1, F1);
+ const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
+ const __m128i H1 = _mm_and_si128(G1, mask_b);
+ const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_blue, red_to_blue, histo);
+ }
+ }
+}
+
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask = _mm_set1_epi32(0xff);
+
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i A1 = _mm_and_si128(in1, mask_g);
+ const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
+ const __m128i B1 = _mm_srli_epi32(in1, 16);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
+ const __m128i E1 = _mm_sub_epi8(B1, C1);
+ const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
+ const __m128i F1 = _mm_and_si128(E1, mask);
+ const __m128i I = _mm_packs_epi32(F0, F1);
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_red, histo);
+ }
+ }
+}
+#undef SPAN
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+#define LINE_SIZE 16 // 8 or 16
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+ for (; i < size; ++i) {
+ out[i] = a[i] + b[i];
+ }
+}
+
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+ for (; i < size; ++i) {
+ out[i] += a[i];
+ }
+}
+#undef LINE_SIZE
+
+//------------------------------------------------------------------------------
+// Entropy
+
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX = 0, sumXY = 0;
+ const __m128i zero = _mm_setzero_si128();
+
+ for (i = 0; i < 256; i += 16) {
+ const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
+ const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
+ const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
+ const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
+ const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
+ const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
+ const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+ const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+ const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+ _mm_packs_epi32(x2, x3));
+ const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+ _mm_packs_epi32(y2, y3));
+ const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+ int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+ while (my) {
+ const int32_t j = BitsCtz(my);
+ int xy;
+ if ((mx >> j) & 1) {
+ const int x = X[i + j];
+ sumXY += x;
+ retval -= VP8LFastSLog2(x);
+ }
+ xy = X[i + j] + Y[i + j];
+ sumX += xy;
+ retval -= VP8LFastSLog2(xy);
+ my &= my - 1;
+ }
+ }
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
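
In one pass the loop evaluates the bit-cost form of two entropies, one for the X histogram and one for the combined X+Y histogram, with S(t) = t * log2(t):

    retval = S(sum_i x_i) - sum_i S(x_i)
           + S(sum_i (x_i + y_i)) - sum_i S(x_i + y_i)

The movemask loop only skips zero bins, which contribute nothing since S(0) = 0. (Note the local names read swapped relative to this formula: sumX accumulates the combined bins and sumXY the X bins.)
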
+
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
+
+#endif
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
+ int match_len;
+
+ if (length >= 12) {
+ __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
+ __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
+ match_len = 0;
+ do {
+ // Loop unrolling and early load both provide a speedup of 10% for the
+ // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
+ const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
+ const __m128i B0 =
+ _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+ const __m128i B1 =
+ _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+ if (_mm_movemask_epi8(cmpA) != 0xffff) break;
+ match_len += 4;
+
+ {
+ const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
+ A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+ A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+ if (_mm_movemask_epi8(cmpB) != 0xffff) break;
+ match_len += 4;
+ }
+ } while (match_len + 12 < length);
+ } else {
+ match_len = 0;
+ // Unroll the potential first two loops.
+ if (length >= 4 &&
+ _mm_movemask_epi8(_mm_cmpeq_epi32(
+ _mm_loadu_si128((const __m128i*)&array1[0]),
+ _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
+ match_len = 4;
+ if (length >= 8 &&
+ _mm_movemask_epi8(_mm_cmpeq_epi32(
+ _mm_loadu_si128((const __m128i*)&array1[4]),
+ _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
+ match_len = 8;
+ }
+ }
+ }
+
+ while (match_len < length && array1[match_len] == array2[match_len]) {
+ ++match_len;
+ }
+ return match_len;
+}
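
The 0xffff tests rely on _mm_cmpeq_epi32 producing all-ones bytes in equal lanes and _mm_movemask_epi8 collecting the top bit of each of the 16 bytes: the mask is 0xffff exactly when all four 32-bit lanes match. A minimal check with made-up values:

    #include <assert.h>
    #include <emmintrin.h>

    int main(void) {
      const __m128i a = _mm_setr_epi32(1, 2, 3, 4);
      const __m128i b = _mm_setr_epi32(1, 2, 99, 4);
      const int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b));
      assert(mask != 0xffff);             /* lane 2 differs */
      assert((mask & 0x00ff) == 0x00ff);  /* lanes 0 and 1 match */
      return 0;
    }
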
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
+ int x;
+ assert(xbits >= 0);
+ assert(xbits <= 3);
+ switch (xbits) {
+ case 0: {
+ const __m128i ff = _mm_set1_epi16((short)0xff00);
+ const __m128i zero = _mm_setzero_si128();
+ // Store 0xff000000 | (row[x] << 8).
+ for (x = 0; x + 16 <= width; x += 16, dst += 16) {
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
+ const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
+ const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
+ const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
+ const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ _mm_storeu_si128((__m128i*)&dst[8], dst2);
+ _mm_storeu_si128((__m128i*)&dst[12], dst3);
+ }
+ break;
+ }
+ case 1: {
+ const __m128i ff = _mm_set1_epi16((short)0xff00);
+ const __m128i mul = _mm_set1_epi16(0x110);
+ for (x = 0; x + 16 <= width; x += 16, dst += 8) {
+ // 0a0b | (where a/b are 4 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
+ const __m128i pack = _mm_and_si128(tmp, ff); // ab00
+ const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ }
+ break;
+ }
+ case 2: {
+ const __m128i mask_or = _mm_set1_epi32(0xff000000);
+ const __m128i mul_cst = _mm_set1_epi16(0x0104);
+ const __m128i mask_mul = _mm_set1_epi16(0x0f00);
+ for (x = 0; x + 16 <= width; x += 16, dst += 4) {
+ // 000a000b000c000d | (where a/b/c/d are 2 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
+ const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
+ const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
+ const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
+ // Convert to 0xff00**00.
+ const __m128i res = _mm_or_si128(pack, mask_or);
+ _mm_storeu_si128((__m128i*)dst, res);
+ }
+ break;
+ }
+ default: {
+ assert(xbits == 3);
+ for (x = 0; x + 16 <= width; x += 16, dst += 2) {
+ // 0000000a00000000b... | (where a/b are 1 bit).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i shift = _mm_slli_epi64(in, 7);
+ const uint32_t move = _mm_movemask_epi8(shift);
+ dst[0] = 0xff000000 | ((move & 0xff) << 8);
+ dst[1] = 0xff000000 | (move & 0xff00);
+ }
+ break;
+ }
+ }
+ if (x != width) {
+ VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
+ }
+}
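
Each case packs 2^xbits consecutive palette indices into the green byte(s) of one output pixel, with alpha forced to 0xff. A scalar sketch of the same packing, following the structure of the portable fallback invoked above for the leftover columns:

    static void BundleColorMapScalar(const uint8_t* const row, int width,
                                     int xbits, uint32_t* dst) {
      int x;
      if (xbits > 0) {
        const int bit_depth = 1 << (3 - xbits);  /* bits per packed index */
        const int mask = (1 << xbits) - 1;       /* indices per pixel, minus 1 */
        uint32_t code = 0xff000000u;
        for (x = 0; x < width; ++x) {
          const int xsub = x & mask;
          if (xsub == 0) code = 0xff000000u;     /* start a new output pixel */
          code |= (uint32_t)row[x] << (8 + bit_depth * xsub);
          dst[x >> xbits] = code;
        }
      } else {
        for (x = 0; x < width; ++x) {
          dst[x] = 0xff000000u | ((uint32_t)row[x] << 8);
        }
      }
    }
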
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+ const __m128i* const a1,
+ __m128i* const avg) {
+ // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+ const __m128i ones = _mm_set1_epi8(1);
+ const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
+ const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
+ *avg = _mm_sub_epi8(avg1, one);
+}
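
_mm_avg_epu8 rounds up, computing (a + b + 1) >> 1 per byte, while the predictors are specified with truncating averages; the identity in the comment corrects for that, since the two differ by one exactly when a + b is odd, i.e. when the low bits of a and b differ. An exhaustive scalar check over all byte pairs:

    #include <assert.h>

    int main(void) {
      unsigned a, b;
      for (a = 0; a < 256; ++a) {
        for (b = 0; b < 256; ++b) {
          const unsigned rounded_up = (a + b + 1) >> 1;  /* _mm_avg_epu8 */
          assert(((a + b) >> 1) == rounded_up - ((a ^ b) & 1));
        }
      }
      return 0;
    }
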
+
+// Predictor0: ARGB_BLACK.
+static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i res = _mm_sub_epi8(src, black);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
+ }
+ (void)upper;
+}
+
+#define GENERATE_PREDICTOR_1(X, IN) \
+ static void PredictorSub##X##_SSE2(const uint32_t* const in, \
+ const uint32_t* const upper, \
+ int num_pixels, uint32_t* const out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
+ const __m128i res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
+ num_pixels - i, out + i); \
+ } \
+ }
+
+GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
+GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
+GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T)
+static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i avg, pred, res;
+ Average2_m128i(&L, &TR, &avg);
+ Average2_m128i(&avg, &T, &pred);
+ res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B) \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
+ const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ __m128i pred, res; \
+ Average2_m128i(&tA, &tB, &pred); \
+ res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)).
+static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ __m128i avgTTR, avgLTL, avg, res;
+ Average2_m128i(&T, &TR, &avgTTR);
+ Average2_m128i(&L, &TL, &avgLTL);
+ Average2_m128i(&avgTTR, &avgLTL, &avg);
+ res = _mm_sub_epi8(src, avg);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
+ // We can unpack with any value on the upper 32 bits, provided it's the same
+ // on both operands (so that their sum of abs diff is zero). Here we use *A.
+ const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+ const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+ const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+ const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+ const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+ const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+ *out = _mm_packs_epi32(s_lo, s_hi);
+}
+
+static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i pa, pb;
+ GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
+ GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
+ {
+ const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+ const __m128i A = _mm_and_si128(mask, L);
+ const __m128i B = _mm_andnot_si128(mask, T);
+ const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa) ? L : T
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor12: ClampedSubSubtractFull.
+static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+ const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
+ const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+ const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
+ const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor13: ClampedAddSubtractHalf
+static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 2 <= num_pixels; i += 2) {
+ // we can only process two pixels at a time
+ const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
+ const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i sum = _mm_add_epi16(T_lo, L_lo);
+ const __m128i avg = _mm_srli_epi16(sum, 1);
+ const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
+ const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
+ const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
+ const __m128i A3 = _mm_srai_epi16(A2, 1);
+ const __m128i A4 = _mm_add_epi16(avg, A3);
+ const __m128i pred = _mm_packus_epi16(A4, A4);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storel_epi64((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
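
Per channel this predictor is pred = clip(avg + (avg - TL) / 2) with avg = (L + T) >> 1; the bit_fix term turns the flooring arithmetic shift into the truncating division the scalar definition uses when avg - TL is negative. As a scalar sketch:

    static int Clip255(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : v; }

    static int PredictChannel13(int l, int t, int tl) {
      const int avg = (l + t) >> 1;          /* sum + srli above */
      return Clip255(avg + (avg - tl) / 2);  /* A1..A4 + packus above */
    }
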
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+ VP8LTransformColor = TransformColor_SSE2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+ VP8LAddVector = AddVector_SSE2;
+ VP8LAddVectorEq = AddVectorEq_SSE2;
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+#endif
+ VP8LVectorMismatch = VectorMismatch_SSE2;
+ VP8LBundleColorMap = BundleColorMap_SSE2;
+
+ VP8LPredictorsSub[0] = PredictorSub0_SSE2;
+ VP8LPredictorsSub[1] = PredictorSub1_SSE2;
+ VP8LPredictorsSub[2] = PredictorSub2_SSE2;
+ VP8LPredictorsSub[3] = PredictorSub3_SSE2;
+ VP8LPredictorsSub[4] = PredictorSub4_SSE2;
+ VP8LPredictorsSub[5] = PredictorSub5_SSE2;
+ VP8LPredictorsSub[6] = PredictorSub6_SSE2;
+ VP8LPredictorsSub[7] = PredictorSub7_SSE2;
+ VP8LPredictorsSub[8] = PredictorSub8_SSE2;
+ VP8LPredictorsSub[9] = PredictorSub9_SSE2;
+ VP8LPredictorsSub[10] = PredictorSub10_SSE2;
+ VP8LPredictorsSub[11] = PredictorSub11_SSE2;
+ VP8LPredictorsSub[12] = PredictorSub12_SSE2;
+ VP8LPredictorsSub[13] = PredictorSub13_SSE2;
+ VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/lossless_enc_sse41.c b/media/libwebp/dsp/lossless_enc_sse41.c
new file mode 100644
index 0000000000..2c6bc5bb00
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse41.c
@@ -0,0 +1,155 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "../dsp/lossless.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+ -1, 5, -1, 5, -1, 1, -1, 1);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+ const __m128i out = _mm_sub_epi8(in, in_0g0g);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const __m128i mult =
+ MK_CST_16(CST_5b(red_to_blue) + 256, CST_5b(green_to_blue));
+ const __m128i perm =
+ _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+ const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+ const __m128i D1 = _mm_sub_epi16(A1, C1);
+ __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2, D2;
+ ++histo[_mm_extract_epi8(E, 0)];
+ B2 = _mm_shuffle_epi8(A2, perm);
+ ++histo[_mm_extract_epi8(E, 4)];
+ C2 = _mm_mulhi_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(E, 8)];
+ D2 = _mm_sub_epi16(A2, C2);
+ ++histo[_mm_extract_epi8(E, 12)];
+ E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+ }
+ ++histo[_mm_extract_epi8(E, 0)];
+ ++histo[_mm_extract_epi8(E, 4)];
+ ++histo[_mm_extract_epi8(E, 8)];
+ ++histo[_mm_extract_epi8(E, 12)];
+ }
+ }
+ {
+ const int left_over = tile_width & 3;
+ if (left_over > 0) {
+ VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_blue, red_to_blue, histo);
+ }
+ }
+}
+
+static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_and_si128(A1, mask_g);
+ const __m128i C1 = _mm_madd_epi16(B1, mult);
+ __m128i D = _mm_sub_epi16(A1, C1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2;
+ ++histo[_mm_extract_epi8(D, 2)];
+ B2 = _mm_and_si128(A2, mask_g);
+ ++histo[_mm_extract_epi8(D, 6)];
+ C2 = _mm_madd_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
+ D = _mm_sub_epi16(A2, C2);
+ }
+ ++histo[_mm_extract_epi8(D, 2)];
+ ++histo[_mm_extract_epi8(D, 6)];
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
+ }
+ }
+ {
+ const int left_over = tile_width & 3;
+ if (left_over > 0) {
+ VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height, green_to_red,
+ histo);
+ }
+ }
+}
+
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/lossless_mips_dsp_r2.c b/media/libwebp/dsp/lossless_mips_dsp_r2.c
new file mode 100644
index 0000000000..ec98834f84
--- /dev/null
+++ b/media/libwebp/dsp/lossless_mips_dsp_r2.c
@@ -0,0 +1,701 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
+static void FUNC_NAME(const TYPE* src, \
+ const uint32_t* const color_map, \
+ TYPE* dst, int y_start, int y_end, \
+ int width) { \
+ int y; \
+ for (y = y_start; y < y_end; ++y) { \
+ int x; \
+ for (x = 0; x < (width >> 2); ++x) { \
+ int tmp1, tmp2, tmp3, tmp4; \
+ __asm__ volatile ( \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "lbu %[tmp1], 0(%[src]) \n\t" \
+ "lbu %[tmp2], 1(%[src]) \n\t" \
+ "lbu %[tmp3], 2(%[src]) \n\t" \
+ "lbu %[tmp4], 3(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "lw %[tmp1], 0(%[src]) \n\t" \
+ "lw %[tmp2], 4(%[src]) \n\t" \
+ "lw %[tmp3], 8(%[src]) \n\t" \
+ "lw %[tmp4], 12(%[src]) \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "addiu %[src], %[src], 16 \n\t" \
+ ".endif \n\t" \
+ "sll %[tmp1], %[tmp1], 2 \n\t" \
+ "sll %[tmp2], %[tmp2], 2 \n\t" \
+ "sll %[tmp3], %[tmp3], 2 \n\t" \
+ "sll %[tmp4], %[tmp4], 2 \n\t" \
+ "lwx %[tmp1], %[tmp1](%[color_map]) \n\t" \
+ "lwx %[tmp2], %[tmp2](%[color_map]) \n\t" \
+ "lwx %[tmp3], %[tmp3](%[color_map]) \n\t" \
+ "lwx %[tmp4], %[tmp4](%[color_map]) \n\t" \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "sb %[tmp1], 0(%[dst]) \n\t" \
+ "sb %[tmp2], 1(%[dst]) \n\t" \
+ "sb %[tmp3], 2(%[dst]) \n\t" \
+ "sb %[tmp4], 3(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "sw %[tmp1], 0(%[dst]) \n\t" \
+ "sw %[tmp2], 4(%[dst]) \n\t" \
+ "sw %[tmp3], 8(%[dst]) \n\t" \
+ "sw %[tmp4], 12(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 16 \n\t" \
+ ".endif \n\t" \
+ : [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), \
+ [tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst) \
+ : [color_map]"r"(color_map) \
+ : "memory" \
+ ); \
+ } \
+ for (x = 0; x < (width & 3); ++x) { \
+ *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
+ } \
+ } \
+}
+
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef MAP_COLOR_FUNCS
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "preceu.ph.qbr %[temp1], %[c0] \n\t"
+ "preceu.ph.qbl %[temp2], %[c0] \n\t"
+ "preceu.ph.qbr %[temp3], %[c1] \n\t"
+ "preceu.ph.qbl %[temp4], %[c1] \n\t"
+ "preceu.ph.qbr %[temp5], %[c2] \n\t"
+ "preceu.ph.qbl %[temp0], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "subq.ph %[temp4], %[temp4], %[temp0] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp2;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "adduh.qb %[temp5], %[c0], %[c1] \n\t"
+ "preceu.ph.qbr %[temp3], %[c2] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp2], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp4], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp1], %[temp3] \n\t"
+ "subq.ph %[temp4], %[temp2], %[temp4] \n\t"
+ "shrl.ph %[temp5], %[temp3], 15 \n\t"
+ "shrl.ph %[temp0], %[temp4], 15 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp4], %[temp0], %[temp4] \n\t"
+ "shra.ph %[temp3], %[temp3], 1 \n\t"
+ "shra.ph %[temp4], %[temp4], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp1;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "cmpgdu.lt.qb %[temp1], %[c], %[b] \n\t"
+ "pick.qb %[temp1], %[b], %[c] \n\t"
+ "pick.qb %[temp2], %[c], %[b] \n\t"
+ "cmpgdu.lt.qb %[temp4], %[c], %[a] \n\t"
+ "pick.qb %[temp4], %[a], %[c] \n\t"
+ "pick.qb %[temp5], %[c], %[a] \n\t"
+ "subu.qb %[temp3], %[temp1], %[temp2] \n\t"
+ "subu.qb %[temp0], %[temp4], %[temp5] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "subu %[temp3], %[temp3], %[temp0] \n\t"
+ "slti %[temp0], %[temp3], 0x1 \n\t"
+ "movz %[a], %[b], %[temp0] \n\t"
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
+ [a]"+&r"(a)
+ : [b]"r"(b), [c]"r"(c)
+ );
+ return a;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+ __asm__ volatile (
+ "adduh.qb %[a0], %[a0], %[a1] \n\t"
+ : [a0]"+r"(a0)
+ : [a1]"r"(a1)
+ );
+ return a0;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+ return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average3(*left, top[0], top[1]);
+}
+
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2(*left, top[-1]);
+}
+
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2(*left, top[0]);
+}
+
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ (void)left;
+ return Average2(top[-1], top[0]);
+}
+
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ (void)left;
+ return Average2(top[0], top[1]);
+}
+
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average4(*left, top[-1], top[0], top[1]);
+}
+
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Select(top[0], *left, top[-1]);
+}
+
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractFull(*left, top[0], top[-1]);
+}
+
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractHalf(*left, top[0], top[-1]);
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "addiu %[dst], %[dst], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "addu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "addu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[dst]) \n\t"
+ "sw %[temp1], -12(%[dst]) \n\t"
+ "sw %[temp2], -8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[dst]) \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ const uint32_t* const p_loop_end = src + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[src]) \n\t"
+ "lw %[argb1], 4(%[src]) \n\t"
+ "sw %[argb], 0(%[dst]) \n\t"
+ "sw %[argb1], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addiu %[dst], %[dst], 8 \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "precrq.ph.w %[new_red], %[argb], %[argb1] \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "addu.ph %[new_red], %[new_red], %[temp5] \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red] \n\t"
+ "shll.ph %[temp4], %[temp5], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "sb %[temp5], -2(%[dst]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -6(%[dst]) \n\t"
+ "sb %[temp3], -4(%[dst]) \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "bne %[src], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[dst]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red]"=&r"(new_red), [argb]"=&r"(argb),
+ [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ // Fall-back to C-version for left-overs.
+ if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
+}
+
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ins %[temp3], %[temp2], 24, 8 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "rotr %[temp3], %[temp3], 16 \n\t"
+ "ins %[temp2], %[temp1], 0, 16 \n\t"
+ "sll %[temp1], %[temp1], 8 \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "balign %[temp0], %[temp1], 1 \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "rotr %[temp0], %[temp0], 16 \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "wsbh %[temp1], %[temp0] \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp1], -2(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -3(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "balign %[temp1], %[temp1], 1 \n\t"
+ "balign %[temp2], %[temp2], 1 \n\t"
+ "balign %[temp3], %[temp3], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp2], 8(%[dst]) \n\t"
+ "usw %[temp3], 12(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 16 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp1], 28, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp1], 12, 4 \n\t"
+ "ins %[temp1], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp2], 28, 4 \n\t"
+ "ins %[temp1], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp2], 12, 4 \n\t"
+ "ins %[temp2], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp3], 28, 4 \n\t"
+ "ins %[temp2], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp3], 12, 4 \n\t"
+ "ins %[temp3], %[temp4], 0, 4 \n\t"
+ "precr.qb.ph %[temp1], %[temp1], %[temp0] \n\t"
+ "ins %[temp3], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "ush %[temp0], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "ush %[temp0], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp1], 5, 11 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+ "ext %[temp0], %[temp1], 8, 16 \n\t"
+ "ext %[temp1], %[temp1], 3, 5 \n\t"
+ "ins %[temp0], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp2], 5, 11 \n\t"
+ "ins %[temp0], %[temp1], 0, 5 \n\t"
+ "ext %[temp1], %[temp2], 8, 16 \n\t"
+ "ext %[temp2], %[temp2], 3, 5 \n\t"
+ "ins %[temp1], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp3], 5, 11 \n\t"
+ "ins %[temp1], %[temp2], 0, 5 \n\t"
+ "ext %[temp2], %[temp3], 8, 16 \n\t"
+ "ext %[temp3], %[temp3], 3, 5 \n\t"
+ "ins %[temp2], %[temp5], 0, 11 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "ins %[temp2], %[temp3], 0, 5 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "append %[temp2], %[temp1], 16 \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "ush %[temp4], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp4], %[temp4] \n\t"
+ "ush %[temp4], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ins %[temp0], %[temp1], 24, 8 \n\t"
+ "sra %[temp1], %[temp1], 8 \n\t"
+ "ins %[temp1], %[temp2], 16, 16 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "balign %[temp3], %[temp2], 1 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp0], -3(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -1(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
+ VP8LMapColor32b = MapARGB_MIPSdspR2;
+ VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+ VP8LPredictors[5] = Predictor5_MIPSdspR2;
+ VP8LPredictors[6] = Predictor6_MIPSdspR2;
+ VP8LPredictors[7] = Predictor7_MIPSdspR2;
+ VP8LPredictors[8] = Predictor8_MIPSdspR2;
+ VP8LPredictors[9] = Predictor9_MIPSdspR2;
+ VP8LPredictors[10] = Predictor10_MIPSdspR2;
+ VP8LPredictors[11] = Predictor11_MIPSdspR2;
+ VP8LPredictors[12] = Predictor12_MIPSdspR2;
+ VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+ VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_msa.c b/media/libwebp/dsp/lossless_msa.c
new file mode 100644
index 0000000000..16256ab57f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_msa.c
@@ -0,0 +1,356 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of methods for lossless decoder
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2; \
+ LD_UB4(psrc, 16, src0, src1, src2, src3); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src3, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ ST_UB(dst2, pdst + 32); \
+} while (0)
+
+#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ uint32_t pix_w; \
+ v16u8 src0, src1, src2, dst0, dst1, dst2; \
+ LD_UB3(psrc, 16, src0, src1, src2); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src2, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ pix_w = __msa_copy_s_w((v4i32)dst2, 0); \
+ SW(pix_w, pdst + 32); \
+} while (0)
+
+#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
+ uint64_t pix_d; \
+ v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
+ LD_UB2(psrc, 16, src0, src1); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ ST_UB(dst0, pdst); \
+ pix_d = __msa_copy_s_d((v2i64)dst1, 0); \
+ SD(pix_d, pdst + 16); \
+} while (0)
+
+#define CONVERT4_BGRA_XXX(psrc, pdst, m) do { \
+ const v16u8 src0 = LD_UB(psrc); \
+ const v16u8 dst0 = VSHF_UB(src0, src0, m); \
+ uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); \
+ uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); \
+ SD(pix_d, pdst + 0); \
+ SW(pix_w, pdst + 8); \
+} while (0)
+
+#define CONVERT1_BGRA_BGR(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = b; \
+ (pdst)[1] = g; \
+ (pdst)[2] = r; \
+} while (0)
+
+#define CONVERT1_BGRA_RGB(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = r; \
+ (pdst)[1] = g; \
+ (pdst)[2] = b; \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, \
+ c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src0); \
+ t1 = __msa_addv_h(t1, (v8i16)src1); \
+ t4 = __msa_srli_w((v4i32)t0, 16); \
+ t5 = __msa_srli_w((v4i32)t1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ ADD2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src); \
+ t2 = __msa_srli_w((v4i32)t0, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 + t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int i;
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ v16u8 src0, dst0;
+ const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(ptemp_src, 16, src0, src1);
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_dst, 16);
+ ptemp_src += 32;
+ ptemp_dst += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_src);
+ dst0 = VSHF_UB(src0, src0, mask);
+ ST_UB(dst0, ptemp_dst);
+ ptemp_src += 16;
+ ptemp_dst += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = ptemp_src[0];
+ const uint8_t g = ptemp_src[1];
+ const uint8_t r = ptemp_src[2];
+ const uint8_t a = ptemp_src[3];
+ ptemp_dst[0] = r;
+ ptemp_dst[1] = g;
+ ptemp_dst[2] = b;
+ ptemp_dst[3] = a;
+ ptemp_src += 4;
+ ptemp_dst += 4;
+ }
+ }
+}
+
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
+ 16, 17, 18, 20 };
+ const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
+ 21, 22, 24, 25 };
+ const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
+ 26, 28, 29, 30 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
+ 18, 17, 16, 22 };
+ const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
+ 21, 20, 26, 25 };
+ const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
+ 24, 30, 29, 28 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
+ int i;
+ const uint8_t* in = (const uint8_t*)src;
+ uint8_t* out = (uint8_t*)dst;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(in, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, out, 16);
+ in += 32;
+ out += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(in);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 + tmp0;
+ ST_UB(dst0, out);
+ in += 16;
+ out += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = in[0];
+ const uint8_t g = in[1];
+ const uint8_t r = in[2];
+ out[0] = (b + g) & 0xff;
+ out[1] = g;
+ out[2] = (r + g) & 0xff;
+ out[3] = in[3]; // alpha is unchanged
+ in += 4;
+ out += 4;
+ }
+ }
+}
+
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(src, 4, src0, src1);
+ TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, dst, 4);
+ src += 8;
+ dst += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, dst);
+ src += 4;
+ dst += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, dst + 0);
+ SW(pix_w, dst + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, dst);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, dst);
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+ VP8LTransformColorInverse = TransformColorInverse_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_neon.c b/media/libwebp/dsp/lossless_neon.c
index a7bf47f3c4..2122e46f7a 100644
--- a/media/libwebp/dsp/lossless_neon.c
+++ b/media/libwebp/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
return avg;
}
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
- return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average3_NEON(*left, top[0], top[1]);
}
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
- return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2_NEON(*left, top[-1]);
}
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
- return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2_NEON(*left, top[0]);
}
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
- return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
}
// Batch versions of those functions.
diff --git a/media/libwebp/dsp/lossless_sse2.c b/media/libwebp/dsp/lossless_sse2.c
index c40fcfb769..03796493de 100644
--- a/media/libwebp/dsp/lossless_sse2.c
+++ b/media/libwebp/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
#include "../dsp/common_sse2.h"
#include "../dsp/lossless.h"
#include "../dsp/lossless_common.h"
-#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
@@ -139,42 +138,51 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
return output;
}
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2_SSE2(*left, top[-1]);
return pred;
}
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2_SSE2(*left, top[0]);
return pred;
}
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
return pred;
}
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
return pred;
}
@@ -191,8 +199,9 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
- VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+ VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
}
+ (void)upper;
}
// Predictor1: left.
diff --git a/media/libwebp/dsp/lossless_sse41.c b/media/libwebp/dsp/lossless_sse41.c
new file mode 100644
index 0000000000..3308ac31ee
--- /dev/null
+++ b/media/libwebp/dsp/lossless_sse41.c
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "../dsp/common_sse41.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+ const uint32_t* const src,
+ int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
+ const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
+ (CST(green_to_blue_) & 0xffff));
+ const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
+#undef CST
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
+ const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+ -1, 9, -1, 9, -1, 13, -1, 13);
+ const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+ -1, 10, -1, -1, -1, 14, -1, -1);
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+ const __m128i C = _mm_mulhi_epi16(B, mults_rb);
+ const __m128i D = _mm_add_epi8(A, C);
+ const __m128i E = _mm_shuffle_epi8(D, perm2);
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+ const __m128i G = _mm_add_epi8(D, F);
+ const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
+ _mm_storeu_si128((__m128i*)&dst[i], out);
+ }
+ // Fall-back to C-version for left-overs.
+ if (i != num_pixels) {
+ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+#define ARGB_TO_RGB_SSE41 do { \
+ while (num_pixels >= 16) { \
+ const __m128i in0 = _mm_loadu_si128(in + 0); \
+ const __m128i in1 = _mm_loadu_si128(in + 1); \
+ const __m128i in2 = _mm_loadu_si128(in + 2); \
+ const __m128i in3 = _mm_loadu_si128(in + 3); \
+ const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
+ const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
+ const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
+ const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
+ const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+ const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+ const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+ _mm_storeu_si128(out + 0, b0); \
+ _mm_storeu_si128(out + 1, b1); \
+ _mm_storeu_si128(out + 2, b2); \
+ in += 4; \
+ out += 3; \
+ num_pixels -= 16; \
+ } \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+ uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+ 8, 14, 13, 12, -1, -1, -1, -1);
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+ ARGB_TO_RGB_SSE41;
+
+ // left-overs
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+ 12, 13, 14, -1, -1, -1, -1);
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+ ARGB_TO_RGB_SSE41;
+
+ // left-overs
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
+ VP8LTransformColorInverse = TransformColorInverse_SSE41;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/moz.build b/media/libwebp/dsp/moz.build
index c00d5e1401..6e2c9deb2c 100644
--- a/media/libwebp/dsp/moz.build
+++ b/media/libwebp/dsp/moz.build
@@ -1,4 +1,5 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -8,53 +9,103 @@ with Files('**'):
SOURCES += [
'alpha_processing.c',
- 'alpha_processing_neon.c',
- 'alpha_processing_sse2.c',
- 'alpha_processing_sse41.c',
+ 'cost.c',
'dec.c',
'dec_clip_tables.c',
- 'dec_neon.c',
- 'dec_sse2.c',
- 'dec_sse41.c',
+ 'enc.c',
'filters.c',
- 'filters_neon.c',
- 'filters_sse2.c',
'lossless.c',
- 'lossless_neon.c',
- 'lossless_sse2.c',
+ 'lossless_enc.c',
'rescaler.c',
- 'rescaler_neon.c',
- 'rescaler_sse2.c',
+ 'ssim.c',
'upsampling.c',
- 'upsampling_neon.c',
- 'upsampling_sse2.c',
- 'upsampling_sse41.c',
'yuv.c',
- 'yuv_neon.c',
- 'yuv_sse2.c',
- 'yuv_sse41.c',
]
if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
- SOURCES['alpha_processing_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['dec_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['filters_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['lossless_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['rescaler_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['upsampling_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['yuv_neon.c'].flags += CONFIG['NEON_FLAGS']
+ SOURCES += [
+ 'alpha_processing_neon.c',
+ 'cost_neon.c',
+ 'dec_neon.c',
+ 'enc_neon.c',
+ 'filters_neon.c',
+ 'lossless_enc_neon.c',
+ 'lossless_neon.c',
+ 'rescaler_neon.c',
+ 'upsampling_neon.c',
+ 'yuv_neon.c',
+ ]
+ DEFINES['WEBP_HAVE_NEON'] = 1
+ for f in SOURCES:
+ if f.endswith('neon.c'):
+ SOURCES[f].flags += CONFIG['NEON_FLAGS']
+elif CONFIG['CPU_ARCH'] == 'aarch64':
+ SOURCES += [
+ 'alpha_processing_neon.c',
+ 'cost_neon.c',
+ 'dec_neon.c',
+ 'enc_neon.c',
+ 'filters_neon.c',
+ 'lossless_enc_neon.c',
+ 'lossless_neon.c',
+ 'rescaler_neon.c',
+ 'upsampling_neon.c',
+ 'yuv_neon.c',
+ ]
+ DEFINES['WEBP_HAVE_NEON'] = 1
elif CONFIG['INTEL_ARCHITECTURE']:
- SOURCES['alpha_processing_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['alpha_processing_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['dec_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['dec_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['filters_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['lossless_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['rescaler_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['upsampling_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['upsampling_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['yuv_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['yuv_sse41.c'].flags += CONFIG['SSE2_FLAGS']
+ SOURCES += [
+ 'alpha_processing_sse2.c',
+ 'alpha_processing_sse41.c',
+ 'cost_sse2.c',
+ 'dec_sse2.c',
+ 'dec_sse41.c',
+ 'enc_sse2.c',
+ 'enc_sse41.c',
+ 'filters_sse2.c',
+ 'lossless_enc_sse2.c',
+ 'lossless_enc_sse41.c',
+ 'lossless_sse2.c',
+ 'lossless_sse41.c',
+ 'rescaler_sse2.c',
+ 'ssim_sse2.c',
+ 'upsampling_sse2.c',
+ 'upsampling_sse41.c',
+ 'yuv_sse2.c',
+ 'yuv_sse41.c',
+ ]
+ DEFINES['WEBP_HAVE_SSE2'] = 1
+ DEFINES['WEBP_HAVE_SSE41'] = 1
+ for f in SOURCES:
+ if f.endswith('sse2.c'):
+ SOURCES[f].flags += CONFIG['SSE2_FLAGS']
+ elif f.endswith('sse41.c'):
+ SOURCES[f].flags += ['-msse4.1']
+elif CONFIG['CPU_ARCH'].startswith('mips'):
+ SOURCES += [
+ 'alpha_processing_mips_dsp_r2.c',
+ 'cost_mips32.c',
+ 'cost_mips_dsp_r2.c',
+ 'dec_mips32.c',
+ 'dec_mips_dsp_r2.c',
+ 'enc_mips32.c',
+ 'enc_mips_dsp_r2.c',
+ 'filters_mips_dsp_r2.c',
+ 'lossless_enc_mips32.c',
+ 'lossless_enc_mips_dsp_r2.c',
+ 'lossless_mips_dsp_r2.c',
+ 'lossless_msa.c',
+ 'rescaler_mips32.c',
+ 'rescaler_mips_dsp_r2.c',
+ 'rescaler_msa.c',
+ 'upsampling_mips_dsp_r2.c',
+ 'upsampling_msa.c',
+ 'yuv_mips32.c',
+ 'yuv_mips_dsp_r2.c',
+ ]
+
+if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+ CFLAGS += ['-Wno-unreachable-code']
FINAL_LIBRARY = 'gkmedias'
diff --git a/media/libwebp/dsp/msa_macro.h b/media/libwebp/dsp/msa_macro.h
index de026a1d9e..717e3b7b9f 100644
--- a/media/libwebp/dsp/msa_macro.h
+++ b/media/libwebp/dsp/msa_macro.h
@@ -14,6 +14,10 @@
#ifndef WEBP_DSP_MSA_MACRO_H_
#define WEBP_DSP_MSA_MACRO_H_
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
#include <stdint.h>
#include <msa.h>
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+#endif // WEBP_USE_MSA
#endif // WEBP_DSP_MSA_MACRO_H_
diff --git a/media/libwebp/dsp/neon.h b/media/libwebp/dsp/neon.h
index 63c27a2901..9a0f630de0 100644
--- a/media/libwebp/dsp/neon.h
+++ b/media/libwebp/dsp/neon.h
@@ -12,10 +12,12 @@
#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_
-#include <arm_neon.h>
-
#include "../dsp/dsp.h"
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
} while (0)
#endif
+#endif // WEBP_USE_NEON
#endif // WEBP_DSP_NEON_H_
diff --git a/media/libwebp/dsp/quant.h b/media/libwebp/dsp/quant.h
index b82e728a53..14d8613431 100644
--- a/media/libwebp/dsp/quant.h
+++ b/media/libwebp/dsp/quant.h
@@ -10,6 +10,8 @@
#ifndef WEBP_DSP_QUANT_H_
#define WEBP_DSP_QUANT_H_
+#include <string.h>
+
#include "../dsp/dsp.h"
#include "../webp/types.h"
@@ -67,4 +69,17 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
#endif // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) &&
// !defined(WEBP_HAVE_NEON_RTCD)
+static WEBP_INLINE int IsFlatSource16(const uint8_t* src) {
+ const uint32_t v = src[0] * 0x01010101u;
+ int i;
+ for (i = 0; i < 16; ++i) {
+ if (memcmp(src + 0, &v, 4) || memcmp(src + 4, &v, 4) ||
+ memcmp(src + 8, &v, 4) || memcmp(src + 12, &v, 4)) {
+ return 0;
+ }
+ src += BPS;
+ }
+ return 1;
+}
+
#endif // WEBP_DSP_QUANT_H_
diff --git a/media/libwebp/dsp/rescaler.c b/media/libwebp/dsp/rescaler.c
index 6bf387f8e0..4bbd281b1c 100644
--- a/media/libwebp/dsp/rescaler.c
+++ b/media/libwebp/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
- int left = src[x_in];
- int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+ rescaler_t left = (rescaler_t)src[x_in];
+ rescaler_t right =
+ (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
- right = src[x_in];
+ right = (rescaler_t)src[x_in];
accum += wrk->x_add;
}
}
@@ -109,8 +110,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -120,8 +120,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -138,17 +137,15 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
@@ -217,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
@@ -239,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
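Every rescaler variant in this patch makes the same two changes: the debug-only assert(v >= 0 && v <= 255) becomes an explicit clamp on the store, and in the shrink path the rounding (MULT_FIX) and truncating (MULT_FIX_FLOOR) steps swap places so that the fractional carry written back to irow matches what is later subtracted from it. A minimal sketch of the two fixed-point helpers and the clamp, assuming WEBP_RESCALER_RFIX == 32 as in rescaler_utils.h:

    #include <stdint.h>
    #include <stdio.h>

    #define RFIX 32                        /* assumption: matches WEBP_RESCALER_RFIX */
    #define ONE ((uint64_t)1 << RFIX)
    #define ROUNDER (ONE >> 1)
    #define MULT_FIX(x, y)       (((uint64_t)(x) * (y) + ROUNDER) >> RFIX)
    #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> RFIX)

    int main(void) {
      /* Rounding can push v one step past 255; the old assert() only
         caught that in debug builds, the new code clamps instead. */
      const int v = 256;                            /* worst-case overshoot */
      const uint8_t pixel = (v > 255) ? 255u : (uint8_t)v;
      printf("clamped: %u\n", pixel);               /* 255 */

      /* MULT_FIX rounds to nearest; MULT_FIX_FLOOR truncates. */
      const uint32_t x = 3;
      const uint32_t y = (uint32_t)(ONE / 2);       /* 0.5 in fixed point */
      printf("round: %llu floor: %llu\n",
             (unsigned long long)MULT_FIX(x, y),        /* 2 (1.5 rounds up) */
             (unsigned long long)MULT_FIX_FLOOR(x, y)); /* 1 (1.5 truncates) */
      return 0;
    }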
diff --git a/media/libwebp/dsp/rescaler_mips32.c b/media/libwebp/dsp/rescaler_mips32.c
new file mode 100644
index 0000000000..44fad3fbe7
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips32.c
@@ -0,0 +1,295 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+//------------------------------------------------------------------------------
+// Row import
+
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int fx_scale = wrk->fx_scale;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(!wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3;
+ int base, frac, sum;
+ int accum, accum1;
+ int loop_c = x_out_max - channel;
+
+ __asm__ volatile (
+ "li %[temp1], 0x8000 \n\t"
+ "li %[temp2], 0x10000 \n\t"
+ "li %[sum], 0 \n\t"
+ "li %[accum], 0 \n\t"
+ "1: \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "li %[base], 0 \n\t"
+ "blez %[accum], 3f \n\t"
+ "2: \n\t"
+ "lbu %[base], 0(%[src1]) \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "addu %[sum], %[sum], %[base] \n\t"
+ "bgtz %[accum], 2b \n\t"
+ "3: \n\t"
+ "negu %[accum1], %[accum] \n\t"
+ "mul %[frac], %[base], %[accum1] \n\t"
+ "mul %[temp3], %[sum], %[x_sub] \n\t"
+ "subu %[loop_c], %[loop_c], %[x_stride] \n\t"
+ "mult %[temp1], %[temp2] \n\t"
+ "maddu %[frac], %[fx_scale] \n\t"
+ "mfhi %[sum] \n\t"
+ "subu %[temp3], %[temp3], %[frac] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "bgtz %[loop_c], 1b \n\t"
+ : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+ [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+ [frow]"+r"(frow), [accum1]"=&r"(accum1),
+ [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+ : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+ [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+ [loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
+ : "memory", "hi", "lo"
+ );
+ assert(accum == 0);
+ }
+}
+
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int src_width = wrk->src_width;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3, temp4;
+ int frac;
+ int accum;
+ int x_out = channel;
+
+ __asm__ volatile (
+ "addiu %[temp3], %[src_width], -1 \n\t"
+ "lbu %[temp2], 0(%[src1]) \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "bgtz %[temp3], 0f \n\t"
+ "addiu %[temp1], %[temp2], 0 \n\t"
+ "b 3f \n\t"
+ "0: \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "3: \n\t"
+ "addiu %[accum], %[x_add], 0 \n\t"
+ "1: \n\t"
+ "subu %[temp3], %[temp2], %[temp1] \n\t"
+ "mul %[temp3], %[temp3], %[accum] \n\t"
+ "mul %[temp4], %[temp1], %[x_add] \n\t"
+ "addu %[temp3], %[temp4], %[temp3] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "addu %[x_out], %[x_out], %[x_stride] \n\t"
+ "subu %[temp3], %[x_out], %[x_out_max] \n\t"
+ "bgez %[temp3], 2f \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "bgez %[accum], 4f \n\t"
+ "addiu %[temp2], %[temp1], 0 \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "4: \n\t"
+ "b 1b \n\t"
+ "2: \n\t"
+ : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+ : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+ [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+ [x_out_max]"r"(x_out_max)
+ : "memory", "hi", "lo"
+ );
+ assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fy_scale;
+ const int temp6 = x_out_max << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[A], %[temp0] \n\t"
+ "maddu %[B], %[temp1] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mfhi %[temp5] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp5], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fxy_scale;
+ const int temp6 = x_out_max << 2;
+
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "maddu %[temp0], %[yscale] \n\t"
+ "mfhi %[temp1] \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw %[temp1], -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+#endif // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
+ WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+ WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
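The recurring mult/maddu/mfhi triplet in these loops is the fixed-point rounding idiom: "mult" with 0x10000 and 0x8000 preloads the 64-bit hi:lo accumulator with 1 << 31 (the ROUNDER for a 32-bit fixed-point shift), "maddu" adds the unsigned 32x32 product on top, and "mfhi" reads back the top 32 bits. In scalar C the whole sequence is simply:

    #include <stdint.h>
    #include <stdio.h>

    /* What "mult 0x10000,0x8000; maddu x,y; mfhi r" computes on MIPS32. */
    static uint32_t MultFix(uint32_t x, uint32_t y) {
      const uint64_t rounder = (uint64_t)0x10000 * 0x8000;  /* == 1u << 31 */
      return (uint32_t)(((uint64_t)x * y + rounder) >> 32);
    }

    int main(void) {
      /* 0.75 in 32-bit fixed point times 200 -> 150, rounded to nearest. */
      const uint32_t three_quarters = 3u << 30;
      printf("%u\n", MultFix(200, three_quarters));  /* 150 */
      return 0;
    }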
diff --git a/media/libwebp/dsp/rescaler_mips_dsp_r2.c b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
new file mode 100644
index 0000000000..d6f2996578
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
@@ -0,0 +1,314 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+//------------------------------------------------------------------------------
+// Row export
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
+ int i;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp7 = (int)wrk->fxy_scale;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp5], 12(%[frow]) \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[yscale] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[yscale] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[yscale] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[yscale] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "lw %[temp8], 0(%[irow]) \n\t"
+ "lw %[temp9], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "subu %[temp8], %[temp8], %[temp0] \n\t"
+ "subu %[temp9], %[temp9], %[temp1] \n\t"
+ "subu %[temp10], %[temp10], %[temp2] \n\t"
+ "subu %[temp11], %[temp11], %[temp5] \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp8], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp9], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp10], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp11], %[temp7] \n\t"
+ "mfhi %[temp8], $ac0 \n\t"
+ "mfhi %[temp9], $ac1 \n\t"
+ "mfhi %[temp10], $ac2 \n\t"
+ "mfhi %[temp11], $ac3 \n\t"
+ "sw %[temp0], -16(%[irow]) \n\t"
+ "sw %[temp1], -12(%[irow]) \n\t"
+ "sw %[temp2], -8(%[irow]) \n\t"
+ "sw %[temp5], -4(%[irow]) \n\t"
+ "sb %[temp8], -4(%[dst]) \n\t"
+ "sb %[temp9], -3(%[dst]) \n\t"
+ "sb %[temp10], -2(%[dst]) \n\t"
+ "sb %[temp11], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale);
+ const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ *irow++ = frac; // new fractional start
+ }
+ } else {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "lw %[temp1], 4(%[irow]) \n\t"
+ "lw %[temp2], 8(%[irow]) \n\t"
+ "lw %[temp5], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "sw $zero, -16(%[irow]) \n\t"
+ "sw $zero, -12(%[irow]) \n\t"
+ "sw $zero, -8(%[irow]) \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ *irow++ = 0;
+ }
+ }
+}
+#endif // 0
+
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
+ int i;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ const int temp7 = (int)wrk->fy_scale;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp4], 0x10000 \n\t"
+ "li %[temp5], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mult $ac0, %[temp4], %[temp5] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp4], %[temp5] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp4], %[temp5] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp4], %[temp5] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t J = *frow++;
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp8], 0x10000 \n\t"
+ "li %[temp9], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "lw %[temp4], 0(%[irow]) \n\t"
+ "lw %[temp5], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[A], %[temp0] \n\t"
+ "maddu $ac0, %[B], %[temp4] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[A], %[temp1] \n\t"
+ "maddu $ac1, %[B], %[temp5] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[A], %[temp2] \n\t"
+ "maddu $ac2, %[B], %[temp10] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[A], %[temp3] \n\t"
+ "maddu $ac3, %[B], %[temp11] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint64_t I = (uint64_t)A * *frow++
+ + (uint64_t)B * *irow++;
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+#undef MULT_FIX_FLOOR
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/rescaler_msa.c b/media/libwebp/dsp/rescaler_msa.c
new file mode 100644
index 0000000000..3366b6d637
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_msa.c
@@ -0,0 +1,443 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "../dsp/msa_macro.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 t0, t1, t2, t3, t4, t5; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
+ dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
+} while (0)
+
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v16i8 t0, t1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
+ dst0, dst1, dst2, dst3) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
+} while (0)
+
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
+} while (0)
+
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
+ dst0, dst1) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
+} while (0)
+
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ v16i8 t0, t1; \
+ ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
+ ST_UB(out, dst);
+ length -= 16;
+ frow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ length -= 12;
+ frow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ length -= 8;
+ frow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ length -= 4;
+ frow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ WebPRescaler* const wrk) {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ const v4i32 B1 = __msa_fill_w(B);
+ const v4i32 A1 = __msa_fill_w(A);
+ const v4i32 AB = __msa_ilvr_w(A1, B1);
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+ while (length >= 16) {
+ v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+ v16u8 t0, t1, t2, t3, t4, t5;
+ LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+ LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+ CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+ CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+ t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+ ST_UB(t0, dst);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+ LD_UW3(frow, 4, frow0, frow1, frow2);
+ LD_UW3(irow, 4, irow0, irow1, irow2);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 frow0, frow1, irow0, irow1;
+ LD_UW2(frow, 4, frow0, frow1);
+ LD_UW2(irow, 4, irow0, irow1);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 frow0 = LD_UW(frow + 0);
+ const v4u32 irow0 = LD_UW(irow + 0);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ SW(val0_m, dst);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+static void RescalerExportRowExpand_MSA(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ ExportRowExpand_0(frow, dst, x_out_max, wrk);
+ } else {
+ ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+ }
+}
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ const uint32_t yscale,
+ WebPRescaler* const wrk) {
+ const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
+ const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
+ frac0, frac1, frac2, frac3);
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
+ src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
+ ST_UB(out, dst);
+ ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2, frac0, frac1, frac2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
+ LD_UW3(irow, 4, src0, src1, src2);
+ SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_UW3(frac0, frac1, frac2, irow, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1, frac0, frac1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ LD_UW2(irow, 4, src0, src1);
+ SUB2(src0, frac0, src1, frac1, src0, src1);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_UW2(frac0, frac1, irow, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ v4u32 frac0;
+ v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ src0 = LD_UW(irow);
+ src0 = src0 - frac0;
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ SW(val0_m, dst);
+ ST_UW(frac0, irow);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ irow[x_out] = frac;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 dst0;
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
+ ST_UB(dst0, dst);
+ ST_SW4(zero, zero, zero, zero, irow, 4);
+ length -= 16;
+ irow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(irow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_SW3(zero, zero, zero, irow, 4);
+ length -= 12;
+ irow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(irow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_SW2(zero, zero, irow, 4);
+ length -= 8;
+ irow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(irow + 0);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ ST_SW(zero, irow);
+ length -= 4;
+ irow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+static void RescalerExportRowShrink_MSA(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
+ } else {
+ ExportRowShrink_1(irow, dst, x_out_max, wrk);
+ }
+}
+#endif // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+ WebPRescalerExportRowExpand = RescalerExportRowExpand_MSA;
+// WebPRescalerExportRowShrink = RescalerExportRowShrink_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif // WEBP_USE_MSA
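The MSA tails step down through 16-, 12-, 8- and 4-pixel chunks before a final scalar loop; each branch must advance the frow/irow/dst pointers and decrement length by exactly the number of pixels it stored, or the scalar tail re-reads pixels the vector path already emitted. The bookkeeping invariant in miniature (toy code, not the real kernels):

    #include <assert.h>
    #include <stdio.h>

    /* Each branch stands in for one vector store of that many pixels. */
    static int ProcessedOnce(int length) {
      int done = 0;
      while (length >= 16) { done += 16; length -= 16; }
      if (length >= 12)      { done += 12; length -= 12; }
      else if (length >= 8)  { done += 8;  length -= 8;  }
      else if (length >= 4)  { done += 4;  length -= 4;  }
      done += length;                       /* scalar tail */
      return done;
    }

    int main(void) {
      int n;
      for (n = 0; n < 64; ++n) {
        assert(ProcessedOnce(n) == n);      /* every pixel exactly once */
      }
      printf("ok\n");
      return 0;
    }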
diff --git a/media/libwebp/dsp/rescaler_neon.c b/media/libwebp/dsp/rescaler_neon.c
index b560d0cdcc..62bef72113 100644
--- a/media/libwebp/dsp/rescaler_neon.c
+++ b/media/libwebp/dsp/rescaler_neon.c
@@ -81,14 +81,13 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
const uint16x4_t C0 = vmovn_u32(B0);
const uint16x4_t C1 = vmovn_u32(B1);
- const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1));
+ const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1));
vst1_u8(dst + x_out, D);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX_C(J, fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -102,7 +101,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
const uint16x4_t E1 = vmovn_u32(D1);
- const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1));
+ const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1));
vst1_u8(dst + x_out, F);
}
for (; x_out < x_out_max; ++x_out) {
@@ -110,8 +109,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX_C(J, fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -135,23 +133,22 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(frow + x_out, in0, in1);
LOAD_32x8(irow + x_out, in2, in3);
- const uint32x4_t A0 = MULT_FIX(in0, yscale_half);
- const uint32x4_t A1 = MULT_FIX(in1, yscale_half);
+ const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half);
+ const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half);
const uint32x4_t B0 = vqsubq_u32(in2, A0);
const uint32x4_t B1 = vqsubq_u32(in3, A1);
- const uint32x4_t C0 = MULT_FIX_FLOOR(B0, fxy_scale_half);
- const uint32x4_t C1 = MULT_FIX_FLOOR(B1, fxy_scale_half);
+ const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
+ const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
const uint16x4_t D0 = vmovn_u32(C0);
const uint16x4_t D1 = vmovn_u32(C1);
- const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1));
+ const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1));
vst1_u8(dst + x_out, E);
STORE_32x8(A0, A1, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR_C(irow[x_out] - frac, fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
+ const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
@@ -161,14 +158,13 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
const uint16x4_t B0 = vmovn_u32(A0);
const uint16x4_t B1 = vmovn_u32(A1);
- const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1));
+ const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1));
vst1_u8(dst + x_out, C);
STORE_32x8(zero, zero, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
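The NEON change swaps vmovn_u16 (truncating narrow: keeps only the low 8 bits) for vqmovn_u16 (saturating narrow: clamps to 0..255), mirroring the scalar clamp added alongside it; without saturation a value of 256 wraps to 0 and shows up as a black pixel. The difference in scalar form:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t NarrowTrunc(uint16_t v) {          /* vmovn_u16  */
      return (uint8_t)v;
    }
    static uint8_t NarrowSat(uint16_t v) {            /* vqmovn_u16 */
      return (v > 255) ? 255u : (uint8_t)v;
    }

    int main(void) {
      const uint16_t v = 256;  /* one past the 8-bit range */
      printf("trunc: %u  sat: %u\n", NarrowTrunc(v), NarrowSat(v)); /* 0 vs 255 */
      return 0;
    }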
diff --git a/media/libwebp/dsp/rescaler_sse2.c b/media/libwebp/dsp/rescaler_sse2.c
index 2d35f76ab0..237997c808 100644
--- a/media/libwebp/dsp/rescaler_sse2.c
+++ b/media/libwebp/dsp/rescaler_sse2.c
@@ -225,35 +225,6 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
_mm_storel_epi64((__m128i*)dst, G);
}
-static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
- const __m128i* const A1,
- const __m128i* const A2,
- const __m128i* const A3,
- const __m128i* const mult,
- uint8_t* const dst) {
- const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
- const __m128i B0 = _mm_mul_epu32(*A0, *mult);
- const __m128i B1 = _mm_mul_epu32(*A1, *mult);
- const __m128i B2 = _mm_mul_epu32(*A2, *mult);
- const __m128i B3 = _mm_mul_epu32(*A3, *mult);
- const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);
- const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_RFIX < 32)
- const __m128i D2 =
- _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask);
- const __m128i D3 =
- _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask);
-#else
- const __m128i D2 = _mm_and_si128(B2, mask);
- const __m128i D3 = _mm_and_si128(B3, mask);
-#endif
- const __m128i E0 = _mm_or_si128(D0, D2);
- const __m128i E1 = _mm_or_si128(D1, D3);
- const __m128i F = _mm_packs_epi32(E0, E1);
- const __m128i G = _mm_packus_epi16(F, F);
- _mm_storel_epi64((__m128i*)dst, G);
-}
-
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
@@ -274,8 +245,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -308,8 +278,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -328,20 +297,15 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
const int scale_xy = wrk->fxy_scale;
const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
- const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
- const __m128i C0 = _mm_add_epi64(B0, rounder);
- const __m128i C1 = _mm_add_epi64(B1, rounder);
- const __m128i C2 = _mm_add_epi64(B2, rounder);
- const __m128i C3 = _mm_add_epi64(B3, rounder);
- const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); // = frac
- const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
- const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
- const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
+ const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); // = frac
+ const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
+ const __m128i D2 = _mm_srli_epi64(B2, WEBP_RESCALER_RFIX);
+ const __m128i D3 = _mm_srli_epi64(B3, WEBP_RESCALER_RFIX);
const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac
const __m128i E1 = _mm_sub_epi64(A1, D1);
const __m128i E2 = _mm_sub_epi64(A2, D2);
@@ -352,14 +316,13 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
- ProcessRow_Floor_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+ ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
@@ -375,8 +338,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
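ProcessRow_Floor_SSE2 can be deleted because, after the MULT_FIX/MULT_FIX_FLOOR swap, both call sites want the rounding variant, and the final _mm_packs_epi32/_mm_packus_epi16 pair in the surviving ProcessRow_SSE2 already saturates each lane to 0..255 in hardware, which is also what makes dropping the asserts safe. A standalone demonstration of that saturation (SSE2 intrinsics, x86 only):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      /* signed 16-bit lanes, some outside the 8-bit range */
      const __m128i v = _mm_setr_epi16(-5, 0, 128, 255, 256, 300, 1000, 42);
      const __m128i packed = _mm_packus_epi16(v, v);  /* saturating 16->8 pack */
      unsigned char out[16];
      int i;
      _mm_storeu_si128((__m128i*)out, packed);
      for (i = 0; i < 8; ++i) printf("%u ", out[i]);  /* 0 0 128 255 255 255 255 42 */
      printf("\n");
      return 0;
    }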
diff --git a/media/libwebp/dsp/ssim.c b/media/libwebp/dsp/ssim.c
new file mode 100644
index 0000000000..4141e32fd8
--- /dev/null
+++ b/media/libwebp/dsp/ssim.c
@@ -0,0 +1,159 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+ 1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(
+ const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
+ const uint32_t w2 = N * N;
+ const uint32_t C1 = 20 * w2;
+ const uint32_t C2 = 60 * w2;
+ const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
+ const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+ const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+ if (xmxm + ymym >= C3) {
+ const int64_t xmym = (int64_t)stats->xm * stats->ym;
+ const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
+ const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+ const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+ // we descale by 8 to prevent overflow during the fnum/fden multiply.
+ const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+ const uint64_t den_S = (sxx + syy + C2) >> 8;
+ const uint64_t fnum = (2 * xmym + C1) * num_S;
+ const uint64_t fden = (xmxm + ymym + C1) * den_S;
+ const double r = (double)fnum / fden;
+ assert(r >= 0. && r <= 1.0);
+ return r;
+ }
+ return 1.; // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, kWeightSum);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2,
+ int xo, int yo, int W, int H) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+ const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+ : yo + VP8_SSIM_KERNEL;
+ const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+ const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+ : xo + VP8_SSIM_KERNEL;
+ int x, y;
+ src1 += ymin * stride1;
+ src2 += ymin * stride2;
+ for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+ for (x = xmin; x <= xmax; ++x) {
+ const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+ * kWeight[VP8_SSIM_KERNEL + y - yo];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.w += w;
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStatsClipped(&stats);
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ int x, y;
+ for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+ for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+ const uint32_t w = kWeight[x] * kWeight[y];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStats(&stats);
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i;
+ uint32_t sse2 = 0;
+ assert(len <= 65535); // to ensure that accumulation fits within uint32_t
+ for (i = 0; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGetClipped = SSIMGetClipped_C;
+ VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8SSIMDspInitSSE2();
+ }
+#endif
+ }
+}
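SSIMCalculation evaluates the standard SSIM ratio on weighted sums rather than means: with N the total weight, sxy = xym*N - xm*ym is N^2 times the covariance, and the result is (2*xm*ym + C1)(2*sxy + C2) / ((xm^2 + ym^2 + C1)(sxx + syy + C2)); the >> 8 descaling only keeps the 64-bit multiplies from overflowing and cancels in the ratio. A floating-point cross-check of the same computation (illustrative numbers):

    #include <stdio.h>

    /* stats hold weighted sums, exactly as in VP8DistoStats. */
    typedef struct { double w, xm, ym, xxm, xym, yym; } Stats;

    static double SSIM(const Stats* s) {
      const double N = s->w;
      const double C1 = 20. * N * N, C2 = 60. * N * N;
      const double sxy = s->xym * N - s->xm * s->ym;  /* N^2 * covariance  */
      const double sxx = s->xxm * N - s->xm * s->xm;  /* N^2 * variance(x) */
      const double syy = s->yym * N - s->ym * s->ym;
      return (2. * s->xm * s->ym + C1) * (2. * (sxy < 0. ? 0. : sxy) + C2)
           / ((s->xm * s->xm + s->ym * s->ym + C1) * (sxx + syy + C2));
    }

    int main(void) {
      /* identical signals (weight 16, constant value 100) -> SSIM == 1 */
      const Stats same = { 16., 1600., 1600., 160000., 160000., 160000. };
      printf("%f\n", SSIM(&same));   /* 1.000000 */
      return 0;
    }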
diff --git a/media/libwebp/dsp/ssim_sse2.c b/media/libwebp/dsp/ssim_sse2.c
new file mode 100644
index 0000000000..89810a77c8
--- /dev/null
+++ b/media/libwebp/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper function
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i = 0;
+ uint32_t sse2 = 0;
+ if (len >= 16) {
+ const int limit = len - 32;
+ int32_t tmp[4];
+ __m128i sum1;
+ __m128i sum = _mm_setzero_si128();
+ __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ while (i <= limit) {
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ __m128i sum2;
+ i += 16;
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ SubtractAndSquare_SSE2(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, sum2);
+ }
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+ }
+
+ for (; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+ uint16_t tmp[8];
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi16(*m, a);
+ _mm_storeu_si128((__m128i*)tmp, b);
+ return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi32(*m, a);
+ const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+ return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do { \
+ /* compute row weight (Wx * Wy) */ \
+ const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
+ const __m128i W = _mm_mullo_epi16(Wx, Wy); \
+ /* process 8 bytes at a time (7 bytes, actually) */ \
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+ /* convert to 16b and multiply by weight */ \
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
+ const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
+ const __m128i wa1 = _mm_mullo_epi16(a1, W); \
+ const __m128i wb1 = _mm_mullo_epi16(b1, W); \
+ /* accumulate */ \
+ xm = _mm_add_epi16(xm, wa1); \
+ ym = _mm_add_epi16(ym, wb1); \
+ xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
+ xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
+ yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
+ src1 += stride1; \
+ src2 += stride2; \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i xm = zero, ym = zero; // 16b accums
+ __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
+ const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+ assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+ ACCUMULATE_ROW(1);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(4);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(1);
+ stats.xm = HorizontalAdd16b_SSE2(&xm);
+ stats.ym = HorizontalAdd16b_SSE2(&ym);
+ stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+ stats.xym = HorizontalAdd32b_SSE2(&xym);
+ stats.yym = HorizontalAdd32b_SSE2(&yym);
+ return VP8SSIMFromStats(&stats);
+}
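
SSIMGet_SSE2 gathers the five weighted sums that VP8SSIMFromStats consumes (sum of w*x, w*y, w*x^2, w*x*y, w*y^2) over a 7x7 window, with the per-pixel weight Wx*Wy taken from the triangular kWeight kernel. A scalar sketch of the same gathering, assuming the VP8DistoStats fields used above and a zero-initialized struct (SSIMAccumulate_C is a hypothetical name):

/* Scalar sketch of the moment gathering done by SSIMGet_SSE2.
 * Assumes '*stats' was zero-initialized by the caller. */
static void SSIMAccumulate_C(const uint8_t* src1, int stride1,
                             const uint8_t* src2, int stride2,
                             VP8DistoStats* const stats) {
  static const int kW[7] = { 1, 2, 3, 4, 3, 2, 1 };   // same as kWeight
  int x, y;
  for (y = 0; y < 7; ++y) {
    for (x = 0; x < 7; ++x) {
      const int w = kW[x] * kW[y];        // Wx * Wy, as in ACCUMULATE_ROW
      const int s1 = src1[x + y * stride1];
      const int s2 = src2[x + y * stride2];
      stats->xm  += w * s1;
      stats->ym  += w * s2;
      stats->xxm += w * s1 * s1;
      stats->xym += w * s1 * s2;
      stats->yym += w * s2 * s2;
    }
  }
}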
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_SSE2;
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGet = SSIMGet_SSE2;
+#endif
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/upsampling.c b/media/libwebp/dsp/upsampling.c
index b76483a3a6..aa1c807e89 100644
--- a/media/libwebp/dsp/upsampling.c
+++ b/media/libwebp/dsp/upsampling.c
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitYUV444ConvertersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitUpsamplersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
diff --git a/media/libwebp/dsp/upsampling_mips_dsp_r2.c b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
new file mode 100644
index 0000000000..2789c29e02
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
@@ -0,0 +1,291 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include <assert.h>
+#include "../dsp/yuv.h"
+
+#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
+ const int t1 = MultHi(Y, 19077); \
+ const int t2 = MultHi(V, 13320); \
+ R = MultHi(V, 26149); \
+ G = MultHi(U, 6419); \
+ B = MultHi(U, 33050); \
+ R = t1 + R; \
+ G = t1 - G; \
+ B = t1 + B; \
+ R = R - 14234; \
+ G = G - t2 + 8708; \
+ B = B - 17685; \
+ __asm__ volatile ( \
+ "shll_s.w %[" #R "], %[" #R "], 17 \n\t" \
+ "shll_s.w %[" #G "], %[" #G "], 17 \n\t" \
+ "shll_s.w %[" #B "], %[" #B "], 17 \n\t" \
+ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \
+ "srl %[" #R "], %[" #R "], 24 \n\t" \
+ "srl %[" #G "], %[" #G "], 24 \n\t" \
+ "srl %[" #B "], %[" #B "], 24 \n\t" \
+ : [R]"+r"(R), [G]"+r"(G), [B]"+r"(B) \
+ : \
+ ); \
+ } while (0)
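
The saturating shift/pack sequence at the end of this macro implements the final "shift right by 6 and clamp to [0, 255]" step of the usual fixed-point YUV-to-RGB formula. For comparison, a plain-C sketch of the same conversion with identical constants (MultHi comes from yuv.h; ClipTo8 and YuvToRgbRef are hypothetical names — the MSA file later in this patch carries the equivalent Clip8/YuvToRgb pair):

/* Plain-C counterpart of the YUV_TO_RGB macro above (sketch only). */
static int ClipTo8(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : v; }

static void YuvToRgbRef(int y, int u, int v, uint8_t* const rgb) {
  const int y1 = MultHi(y, 19077);
  rgb[0] = ClipTo8((y1 + MultHi(v, 26149) - 14234) >> 6);                   // R
  rgb[1] = ClipTo8((y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);  // G
  rgb[2] = ClipTo8((y1 + MultHi(u, 33050) - 17685) >> 6);                   // B
}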
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgb[0] = r;
+ rgb[1] = g;
+ rgb[2] = b;
+}
+static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgr[0] = b;
+ bgr[1] = g;
+ bgr[2] = r;
+}
+static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+ }
+}
+static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ argb[0] = 0xff;
+ argb[1] = r;
+ argb[2] = g;
+ argb[3] = b;
+}
+#endif // WEBP_REDUCE_CSP
+static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const bgra) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgra[0] = b;
+ bgra[1] = g;
+ bgra[2] = r;
+ bgra[3] = 0xff;
+}
+static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const rgba) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgba[0] = r;
+ rgba[1] = g;
+ rgba[2] = b;
+ rgba[3] = 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+// [a b]
+// [c d]
+// we interpolate u/v as:
+//   ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
+//   ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
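
As a concrete check of the interpolation weights (illustrative values only): for a 2x2 chroma block with a = 100, b = 120, c = 140, d = 160, the two top outputs are

  (9*100 + 3*120 + 3*140 +   160 + 8) / 16 = 1848 / 16 = 115
  (3*100 + 9*120 +   140 + 3*160 + 8) / 16 = 2008 / 16 = 125

using integer division; the bottom pair follows symmetrically from the second row of weights.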
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+ int x; \
+ const int last_pixel_pair = (len - 1) >> 1; \
+ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
+  uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]);     /* left sample */          \
+ assert(top_y != NULL); \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
+ } \
+ for (x = 1; x <= last_pixel_pair; ++x) { \
+ const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
+ const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
+    /* precompute invariant values associated with first and second diagonals */ \
+ const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
+ const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
+ const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
+ { \
+ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
+ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
+ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (2 * x - 1) * XSTEP); \
+ FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
+ top_dst + (2 * x - 0) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
+ const uint32_t uv1 = (diag_12 + uv) >> 1; \
+ FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (2 * x - 1) * XSTEP); \
+ FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
+ bottom_dst + (2 * x + 0) * XSTEP); \
+ } \
+ tl_uv = t_uv; \
+ l_uv = uv; \
+ } \
+ if (!(len & 1)) { \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
+}
+
+#endif // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
+}
+
+YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
+YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
+YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+#undef YUV444_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
+ WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
+ WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
+ WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif // WEBP_REDUCE_CSP
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
+#endif
diff --git a/media/libwebp/dsp/upsampling_msa.c b/media/libwebp/dsp/upsampling_msa.c
new file mode 100644
index 0000000000..d8ef6feb58
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_msa.c
@@ -0,0 +1,688 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of YUV to RGB upsampling functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include <string.h>
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+#include "../dsp/yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#define ILVR_UW2(in, out0, out1) do { \
+ const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
+ out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \
+ out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \
+} while (0)
+
+#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
+ v16u8 t0, t1; \
+ ILVRL_B2_UB(zero, in, t0, t1); \
+ ILVRL_H2_UW(zero, t0, out0, out1); \
+ ILVRL_H2_UW(zero, t1, out2, out3); \
+} while (0)
+
+#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1, temp2, temp3; \
+ MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \
+ temp0, temp1, temp2, temp3); \
+ PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \
+} while (0)
+
+#define MULTHI_8(in0, in1, cnst, out0) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1; \
+ MUL2(in0, const0, in1, const0, temp0, temp1); \
+ out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \
+} while (0)
+
+#define CALC_R16(y0, y1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ v8i16 b1 = __msa_subs_s_h(a1, const_a); \
+ SRAI_H2_SH(b0, b1, 6); \
+ CLIP_SH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_R8(y0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_SH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a1 = __msa_adds_s_h(b1, const_a); \
+ SRAI_H2_SH(a0, a1, 6); \
+ CLIP_SH2_0_255(a0, a1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \
+} while (0)
+
+#define CALC_G8(y0, u0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a0 = SRAI_H(a0, 6); \
+ CLIP_SH_0_255(a0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \
+} while (0)
+
+#define CALC_B16(y0, y1, u0, u1, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ v8u16 b1 = __msa_subs_u_h(a1, const_a); \
+ SRAI_H2_UH(b0, b1, 6); \
+ CLIP_UH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_B8(y0, u0, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_UH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_RGB16(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, y1, u0, u1, v0, v1; \
+ v4u32 p0, p1, p2, p3; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVRL_UW4(in_y, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
+ ILVRL_UW4(in_v, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
+ CALC_R16(y0, y1, v0, v1, R); \
+ MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
+ ILVRL_UW4(in_u, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
+ CALC_G16(y0, y1, u0, u1, v0, v1, G); \
+ MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
+ CALC_B16(y0, y1, u0, u1, B); \
+} while (0)
+
+#define CALC_RGB8(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, u0, v0; \
+ v4u32 p0, p1; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVR_UW2(in_y, p0, p1); \
+ MULTHI_8(p0, p1, 19077, y0); \
+ ILVR_UW2(in_v, p0, p1); \
+ MULTHI_8(p0, p1, 26149, v0); \
+ CALC_R8(y0, v0, R); \
+ MULTHI_8(p0, p1, 13320, v0); \
+ ILVR_UW2(in_u, p0, p1); \
+ MULTHI_8(p0, p1, 6419, u0); \
+ CALC_G8(y0, u0, v0, G); \
+ MULTHI_8(p0, p1, 33050, u0); \
+ CALC_B8(y0, u0, B); \
+} while (0)
+
+#define STORE16_3(a0, a1, a2, dst) do { \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
+ 8, 25, 9, 10 }; \
+ const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
+ 30, 8, 9, 31 }; \
+ v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ out0 = VSHF_UB(tmp0, a2, mask0); \
+ tmp2 = SLDI_UB(tmp1, tmp0, 11); \
+ out1 = VSHF_UB(tmp2, a2, mask1); \
+ tmp2 = SLDI_UB(tmp1, tmp1, 6); \
+ out2 = VSHF_UB(tmp2, a2, mask2); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+} while (0)
+
+#define STORE8_3(a0, a1, a2, dst) do { \
+ int64_t out_m; \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
+ 255, 255, 255, 255, 255, 255, 255, 255 }; \
+ const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ v16u8 out0, out1; \
+ VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
+ ST_UB(out0, dst); \
+ out_m = __msa_copy_s_d((v2i64)out1, 0); \
+ SD(out_m, dst + 16); \
+} while (0)
+
+#define STORE16_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 out0, out1, out2, out3; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ ILVRL_B2_UB(a3, a2, tmp2, tmp3); \
+ ILVRL_H2_UB(tmp2, tmp0, out0, out1); \
+ ILVRL_H2_UB(tmp3, tmp1, out2, out3); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+ ST_UB(out3, dst + 48); \
+} while (0)
+
+#define STORE8_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \
+ ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \
+ ST_UB(tmp2, dst + 0); \
+ ST_UB(tmp3, dst + 16); \
+} while (0)
+
+#define STORE2_16(a0, a1, dst) do { \
+ v16u8 out0, out1; \
+ ILVRL_B2_UB(a1, a0, out0, out1); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+} while (0)
+
+#define STORE2_8(a0, a1, dst) do { \
+ const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ ST_UB(out0, dst); \
+} while (0)
+
+#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf0); \
+ tmp1 = SRAI_B(G, 4); \
+ RG = tmp0 | tmp1; \
+ tmp0 = ANDI_B(B, 0xf0); \
+ BA = ORI_B(tmp0, 0x0f); \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf8); \
+ tmp1 = SRAI_B(G, 5); \
+ RG = tmp0 | tmp1; \
+ tmp0 = SLLI_B(G, 3); \
+ tmp1 = ANDI_B(tmp0, 0xe0); \
+ tmp0 = SRAI_B(B, 3); \
+ GB = tmp0 | tmp1; \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+static WEBP_INLINE int Clip8(int v) {
+ return v < 0 ? 0 : v > 255 ? 255 : v;
+}
+
+static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ rgb[0] = Clip8(r1 >> 6);
+ rgb[1] = Clip8(g1 >> 6);
+ rgb[2] = Clip8(b1 >> 6);
+}
+
+static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ bgr[0] = Clip8(b1 >> 6);
+ bgr[1] = Clip8(g1 >> 6);
+ bgr[2] = Clip8(r1 >> 6);
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+}
+
+static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+}
+
+static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
+ argb[0] = 0xff;
+ YuvToRgb(y, u, v, argb + 1);
+}
+#endif // WEBP_REDUCE_CSP
+
+static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
+ YuvToBgr(y, u, v, bgra);
+ bgra[3] = 0xff;
+}
+
+static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
+ YuvToRgb(y, u, v, rgba);
+ rgba[3] = 0xff;
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(B, G, R, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(R, G, B, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(&temp[0], u, v, R, G, B);
+ STORE16_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(B, G, R, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(A, R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, BA, tmp0, tmp1;
+ while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
+#else
+ CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, GB, tmp0, tmp1;
+ while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(y, u, v, GB, RG, 16, dst);
+#else
+ CALC_RGB565(y, u, v, RG, GB, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(temp, u, v, GB, RG, 16, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(temp, u, v, GB, RG, 8, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
+ v16u8 s = __msa_aver_u_b(a, d); \
+ v16u8 t = __msa_aver_u_b(b, c); \
+ const v16u8 st = s ^ t; \
+ v16u8 ad = a ^ d; \
+ v16u8 bc = b ^ c; \
+ v16u8 t0 = ad | bc; \
+ v16u8 t1 = t0 | st; \
+ v16u8 t2 = ANDI_B(t1, 1); \
+ v16u8 t3 = __msa_aver_u_b(s, t); \
+ const v16u8 k = t3 - t2; \
+ v16u8 diag1, diag2; \
+ AVER_UB2_UB(t, k, s, k, t0, t1); \
+ bc = bc & st; \
+ ad = ad & st; \
+ t = t ^ k; \
+ s = s ^ k; \
+ t2 = bc | t; \
+ t3 = ad | s; \
+ t2 = ANDI_B(t2, 1); \
+ t3 = ANDI_B(t3, 1); \
+ SUB2(t0, t2, t1, t3, diag1, diag2); \
+ AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
+ ILVRL_B2_UB(t1, t0, a, b); \
+ if (pbot_y != NULL) { \
+ AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
+ ILVRL_B2_UB(t1, t0, c, d); \
+ } \
+} while (0)
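
Although it works only with byte-wide averages, UPSAMPLE_32PIXELS reproduces, with exact rounding, the same (9, 3, 3, 1)/16 kernel described for the MIPS fancy upsampler earlier in this patch: s = avg(a, d) and t = avg(b, c) combine into k, roughly (a + b + c + d) / 4; the XOR terms supply the 1-bit carry corrections that make the rounding exact; and the final averages against a and b (and against c and d for the bottom row) complete the kernel. Per pixel pair, the top-row result is equivalent to this scalar sketch (UpsamplePair_C is a hypothetical name):

/* What UPSAMPLE_32PIXELS computes for one 2x2 neighborhood (top row). */
static void UpsamplePair_C(int a, int b, int c, int d,
                           int* const out0, int* const out1) {
  *out0 = (9 * a + 3 * b + 3 * c +     d + 8) >> 4;
  *out1 = (3 * a + 9 * b +     c + 3 * d + 8) >> 4;
}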
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bot_dst, int len) \
+{ \
+ int size = (len - 1) >> 1; \
+ uint8_t temp_u[64]; \
+ uint8_t temp_v[64]; \
+ const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ const uint8_t* ptop_y = &top_y[1]; \
+ uint8_t* ptop_dst = top_dst + XSTEP; \
+ const uint8_t* pbot_y = &bot_y[1]; \
+ uint8_t* pbot_dst = bot_dst + XSTEP; \
+ \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ if (bot_y != NULL) { \
+ const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
+ } \
+ while (size >= 16) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ LD_UB2(top_u, 1, tu0, tu1); \
+ LD_UB2(cur_u, 1, cu0, cu1); \
+ LD_UB2(top_v, 1, tv0, tv1); \
+ LD_UB2(cur_v, 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
+ } \
+ ptop_y += 32; \
+ pbot_y += 32; \
+ ptop_dst += XSTEP * 32; \
+ pbot_dst += XSTEP * 32; \
+ top_u += 16; \
+ top_v += 16; \
+ cur_u += 16; \
+ cur_v += 16; \
+ size -= 16; \
+ } \
+ if (size > 0) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t)); \
+ LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
+ LD_UB2(&temp_u[32], 1, cu0, cu1); \
+ LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
+ LD_UB2(&temp_v[32], 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
+ } \
+ top_u += size; \
+ top_v += size; \
+ cur_u += size; \
+ cur_v += size; \
+ } \
+ if (!(len & 1)) { \
+ const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ if (bot_y != NULL) { \
+ const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
+ FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
+ bot_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
+
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
+}
+
+#endif // FANCY_UPSAMPLING
+
+#endif // WEBP_USE_MSA
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
+#endif
diff --git a/media/libwebp/dsp/upsampling_neon.c b/media/libwebp/dsp/upsampling_neon.c
index c847d70d46..41cb44b03f 100644
--- a/media/libwebp/dsp/upsampling_neon.c
+++ b/media/libwebp/dsp/upsampling_neon.c
@@ -58,8 +58,8 @@
} while (0)
// Turn the macro into a function to reduce code size where speed is non-critical
-static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
- uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
+ uint8_t* out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@@ -190,14 +190,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
-static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
- const uint8_t *top_u, const uint8_t *top_v, \
- const uint8_t *cur_u, const uint8_t *cur_v, \
- uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
- uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+ uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
  /* 9 pixels must be readable for each block */                              \
const int num_blocks = (uv_len - 1) >> 3; \
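
The pointer rounding in this hunk, (uintptr_t)(uv_buf + 15) & ~15, is the standard align-up idiom: adding (alignment - 1) and masking with ~(alignment - 1) rounds an address up to the next 16-byte boundary. A generic sketch (AlignUp16 is a hypothetical name):

#include <stdint.h>

/* Round 'p' up to the next 16-byte boundary. */
static uint8_t* AlignUp16(uint8_t* const p) {
  return (uint8_t*)(((uintptr_t)p + 15) & ~(uintptr_t)15);
}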
diff --git a/media/libwebp/dsp/yuv.c b/media/libwebp/dsp/yuv.c
index 12c04ca426..bd9db04149 100644
--- a/media/libwebp/dsp/yuv.c
+++ b/media/libwebp/dsp/yuv.c
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitSamplersSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
-#endif // WEBP_USE_NEON
+#endif // WEBP_HAVE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);
diff --git a/media/libwebp/dsp/yuv.h b/media/libwebp/dsp/yuv.h
index 947b89e13c..28524ec422 100644
--- a/media/libwebp/dsp/yuv.h
+++ b/media/libwebp/dsp/yuv.h
@@ -10,7 +10,7 @@
// inline YUV<->RGB conversion function
//
// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/media/libwebp/dsp/yuv_mips32.c b/media/libwebp/dsp/yuv_mips32.c
new file mode 100644
index 0000000000..fc7c2cda0e
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips32.c
@@ -0,0 +1,103 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of YUV to RGB upsampling functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i, r, g, b; \
+ int temp0, temp1, temp2, temp3, temp4; \
+ for (i = 0; i < (len >> 1); i++) { \
+ temp1 = MultHi(v[0], 26149); \
+ temp3 = MultHi(v[0], 13320); \
+ temp2 = MultHi(u[0], 6419); \
+ temp4 = MultHi(u[0], 33050); \
+ temp0 = MultHi(y[0], 19077); \
+ temp1 -= 14234; \
+ temp3 -= 8708; \
+ temp2 += temp3; \
+ temp4 -= 17685; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ temp0 = MultHi(y[1], 19077); \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R + XSTEP] = r; \
+ dst[G + XSTEP] = g; \
+ dst[B + XSTEP] = b; \
+ if (A) dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ temp1 = MultHi(v[0], 26149); \
+ temp3 = MultHi(v[0], 13320); \
+ temp2 = MultHi(u[0], 6419); \
+ temp4 = MultHi(u[0], 33050); \
+ temp0 = MultHi(y[0], 19077); \
+ temp1 -= 14234; \
+ temp3 -= 8708; \
+ temp2 += temp3; \
+ temp4 -= 17685; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ } \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/yuv_mips_dsp_r2.c b/media/libwebp/dsp/yuv_mips_dsp_r2.c
new file mode 100644
index 0000000000..1418a9fba1
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips_dsp_r2.c
@@ -0,0 +1,134 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSPr2 version of YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC_PART_1() \
+ "lbu %[temp3], 0(%[v]) \n\t" \
+ "lbu %[temp4], 0(%[u]) \n\t" \
+ "lbu %[temp0], 0(%[y]) \n\t" \
+ "mul %[temp1], %[t_con_1], %[temp3] \n\t" \
+ "mul %[temp3], %[t_con_2], %[temp3] \n\t" \
+ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \
+ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+ "subu %[temp1], %[temp1], %[t_con_6] \n\t" \
+ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \
+ "addu %[temp2], %[temp2], %[temp3] \n\t" \
+ "subu %[temp4], %[temp4], %[t_con_8] \n\t" \
+
+#define ROW_FUNC_PART_2(R, G, B, K) \
+ "addu %[temp5], %[temp0], %[temp1] \n\t" \
+ "subu %[temp6], %[temp0], %[temp2] \n\t" \
+ "addu %[temp7], %[temp0], %[temp4] \n\t" \
+".if " #K " \n\t" \
+ "lbu %[temp0], 1(%[y]) \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp5], %[temp5], 17 \n\t" \
+ "shll_s.w %[temp6], %[temp6], 17 \n\t" \
+".if " #K " \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp7], %[temp7], 17 \n\t" \
+ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \
+ "srl %[temp5], %[temp5], 24 \n\t" \
+ "srl %[temp6], %[temp6], 24 \n\t" \
+ "srl %[temp7], %[temp7], 24 \n\t" \
+ "sb %[temp5], " #R "(%[dst]) \n\t" \
+ "sb %[temp6], " #G "(%[dst]) \n\t" \
+ "sb %[temp7], " #B "(%[dst]) \n\t" \
+
+#define ASM_CLOBBER_LIST() \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) \
+ : [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3), \
+ [t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6), \
+ [u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst), \
+ [t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8) \
+ : "memory", "hi", "lo" \
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ const int t_con_1 = 26149; \
+ const int t_con_2 = 13320; \
+ const int t_con_3 = 6419; \
+ const int t_con_4 = 33050; \
+ const int t_con_5 = 19077; \
+ const int t_con_6 = 14234; \
+ const int t_con_7 = 8708; \
+ const int t_con_8 = 17685; \
+ for (i = 0; i < (len >> 1); i++) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 1) \
+ ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = 0xff; \
+ } \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+#undef ASM_CLOBBER_LIST
+#undef ROW_FUNC_PART_2
+#undef ROW_FUNC_PART_1
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/enc/alpha_enc.c b/media/libwebp/enc/alpha_enc.c
new file mode 100644
index 0000000000..edfe95ec94
--- /dev/null
+++ b/media/libwebp/enc/alpha_enc.c
@@ -0,0 +1,443 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane compression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Encodes the given alpha data using the specified compression method
+// 'method'. Pre-processing (quantization) is performed if 'quality' is less
+// than 100; in that case, the encoding is lossy. The valid range is [0, 100]
+// for 'quality' and [0, 1] for 'method':
+//   'method = 0' - no compression;
+//   'method = 1' - use the lossless coder on the alpha plane only.
+// 'filter' values [0, 4] correspond to the prediction modes none, horizontal,
+// vertical & gradient; mode 4 tries all the prediction modes 0 to 3 and
+// picks the best one.
+// 'effort_level' specifies how much effort to spend on reducing the
+// compressed output size, in the range 0 (quick) to 6 (slow).
+//
+// 'output' receives the buffer containing the compressed alpha data; it is
+// allocated here, and the caller should call WebPSafeFree(*output) when done.
+// 'output_size' receives the size of this compressed alpha buffer.
+//
+// Returns 1 on success, or 0 if the quality or method is invalid, or if
+// memory allocation for the compressed data fails.
+
+#include "../enc/vp8li_enc.h"
+
+static int EncodeLossless(const uint8_t* const data, int width, int height,
+ int effort_level, // in [0..6] range
+ int use_quality_100, VP8LBitWriter* const bw,
+ WebPAuxStats* const stats) {
+ int ok = 0;
+ WebPConfig config;
+ WebPPicture picture;
+
+ WebPPictureInit(&picture);
+ picture.width = width;
+ picture.height = height;
+ picture.use_argb = 1;
+ picture.stats = stats;
+ if (!WebPPictureAlloc(&picture)) return 0;
+
+ // Transfer the alpha values to the green channel.
+ WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
+ picture.argb, picture.argb_stride);
+
+ WebPConfigInit(&config);
+ config.lossless = 1;
+  // Enable 'exact', otherwise the encoder could alter the RGB values under
+  // transparent alpha. That is normally acceptable, but not here: we are not
+  // encoding the input image but an internal, encoding-related image whose
+  // RGB channels carry exact information that must be preserved.
+ config.exact = 1;
+ config.method = effort_level; // impact is very small
+  // Set a low default quality for encoding alpha. Ensure that the alpha
+  // quality at lower methods (3 and below) stays below the threshold that
+  // triggers the costly 'BackwardReferencesTraceBackwards'.
+  // If the alpha quality is set to 100 and the method to 6, allow a high
+  // lossless quality to trigger the cruncher.
+ config.quality =
+ (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
+ assert(config.quality >= 0 && config.quality <= 100.f);
+
+ // TODO(urvang): Temporary fix to avoid generating images that trigger
+ // a decoder bug related to alpha with color cache.
+ // See: https://code.google.com/p/webp/issues/detail?id=239
+ // Need to re-enable this later.
+ ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+ WebPPictureFree(&picture);
+ ok = ok && !bw->error_;
+ if (!ok) {
+ VP8LBitWriterWipeOut(bw);
+ return 0;
+ }
+ return 1;
+}
+
+// -----------------------------------------------------------------------------
+
+// Small struct to hold the result of a filter mode compression attempt.
+typedef struct {
+ size_t score;
+ VP8BitWriter bw;
+ WebPAuxStats stats;
+} FilterTrial;
+
+// This function always returns an initialized 'bw' object, even upon error.
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
+ int method, int filter, int reduce_levels,
+ int effort_level, // in [0..6] range
+ uint8_t* const tmp_alpha,
+ FilterTrial* result) {
+ int ok = 0;
+ const uint8_t* alpha_src;
+ WebPFilterFunc filter_func;
+ uint8_t header;
+ const size_t data_size = width * height;
+ const uint8_t* output = NULL;
+ size_t output_size = 0;
+ VP8LBitWriter tmp_bw;
+
+ assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
+ assert(filter >= 0 && filter < WEBP_FILTER_LAST);
+ assert(method >= ALPHA_NO_COMPRESSION);
+ assert(method <= ALPHA_LOSSLESS_COMPRESSION);
+ assert(sizeof(header) == ALPHA_HEADER_LEN);
+
+ filter_func = WebPFilters[filter];
+ if (filter_func != NULL) {
+ filter_func(data, width, height, width, tmp_alpha);
+ alpha_src = tmp_alpha;
+ } else {
+ alpha_src = data;
+ }
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+ ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+ !reduce_levels, &tmp_bw, &result->stats);
+ if (ok) {
+ output = VP8LBitWriterFinish(&tmp_bw);
+ output_size = VP8LBitWriterNumBytes(&tmp_bw);
+ if (output_size > data_size) {
+ // compressed size is larger than source! Revert to uncompressed mode.
+ method = ALPHA_NO_COMPRESSION;
+ VP8LBitWriterWipeOut(&tmp_bw);
+ }
+ } else {
+ VP8LBitWriterWipeOut(&tmp_bw);
+ return 0;
+ }
+ }
+
+ if (method == ALPHA_NO_COMPRESSION) {
+ output = alpha_src;
+ output_size = data_size;
+ ok = 1;
+ }
+
+ // Emit final result.
+ header = method | (filter << 2);
+ if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+ VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+ ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+ ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ VP8LBitWriterWipeOut(&tmp_bw);
+ }
+ ok = ok && !result->bw.error_;
+ result->score = VP8BitWriterSize(&result->bw);
+ return ok;
+}
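
The one-byte header assembled above packs three 2-bit fields; a decoder-side sketch of how they unpack (ParseAlphaHeader is a hypothetical helper, with the layout read directly off the 'header = method | (filter << 2)' construction and the ALPHA_PREPROCESSED_LEVELS << 4 flag):

/* Unpack the ALPHA_HEADER_LEN (1-byte) header written by EncodeAlphaInternal. */
static void ParseAlphaHeader(uint8_t header, int* const method,
                             int* const filter, int* const pre_processing) {
  *method         = header & 0x03;         // ALPHA_NO/LOSSLESS_COMPRESSION
  *filter         = (header >> 2) & 0x03;  // WEBP_FILTER_NONE .. GRADIENT
  *pre_processing = (header >> 4) & 0x03;  // ALPHA_PREPROCESSED_LEVELS if set
}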
+
+// -----------------------------------------------------------------------------
+
+static int GetNumColors(const uint8_t* data, int width, int height,
+ int stride) {
+ int j;
+ int colors = 0;
+ uint8_t color[256] = { 0 };
+
+ for (j = 0; j < height; ++j) {
+ int i;
+ const uint8_t* const p = data + j * stride;
+ for (i = 0; i < width; ++i) {
+ color[p[i]] = 1;
+ }
+ }
+ for (j = 0; j < 256; ++j) {
+ if (color[j] > 0) ++colors;
+ }
+ return colors;
+}
+
+#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
+#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
+
+// Given the input 'filter' option, return an OR'd bit-set of filters to try.
+static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
+ int filter, int effort_level) {
+ uint32_t bit_map = 0U;
+ if (filter == WEBP_FILTER_FAST) {
+ // Quick estimate of the best candidate.
+ int try_filter_none = (effort_level > 3);
+ const int kMinColorsForFilterNone = 16;
+ const int kMaxColorsForFilterNone = 192;
+ const int num_colors = GetNumColors(alpha, width, height, width);
+ // For low number of colors, NONE yields better compression.
+ filter = (num_colors <= kMinColorsForFilterNone)
+ ? WEBP_FILTER_NONE
+ : WebPEstimateBestFilter(alpha, width, height, width);
+ bit_map |= 1 << filter;
+ // For large number of colors, try FILTER_NONE in addition to the best
+ // filter as well.
+ if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
+ bit_map |= FILTER_TRY_NONE;
+ }
+ } else if (filter == WEBP_FILTER_NONE) {
+ bit_map = FILTER_TRY_NONE;
+ } else { // WEBP_FILTER_BEST -> try all
+ bit_map = FILTER_TRY_ALL;
+ }
+ return bit_map;
+}
+
+static void InitFilterTrial(FilterTrial* const score) {
+ score->score = (size_t)~0U;
+ VP8BitWriterInit(&score->bw, 0);
+}
+
+static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
+ size_t data_size, int method, int filter,
+ int reduce_levels, int effort_level,
+ uint8_t** const output,
+ size_t* const output_size,
+ WebPAuxStats* const stats) {
+ int ok = 1;
+ FilterTrial best;
+ uint32_t try_map =
+ GetFilterMap(alpha, width, height, filter, effort_level);
+ InitFilterTrial(&best);
+
+ if (try_map != FILTER_TRY_NONE) {
+ uint8_t* filtered_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+ if (filtered_alpha == NULL) return 0;
+
+ for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
+ if (try_map & 1) {
+ FilterTrial trial;
+ ok = EncodeAlphaInternal(alpha, width, height, method, filter,
+ reduce_levels, effort_level, filtered_alpha,
+ &trial);
+ if (ok && trial.score < best.score) {
+ VP8BitWriterWipeOut(&best.bw);
+ best = trial;
+ } else {
+ VP8BitWriterWipeOut(&trial.bw);
+ }
+ }
+ }
+ WebPSafeFree(filtered_alpha);
+ } else {
+ ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
+ reduce_levels, effort_level, NULL, &best);
+ }
+ if (ok) {
+#if !defined(WEBP_DISABLE_STATS)
+ if (stats != NULL) {
+ stats->lossless_features = best.stats.lossless_features;
+ stats->histogram_bits = best.stats.histogram_bits;
+ stats->transform_bits = best.stats.transform_bits;
+ stats->cache_bits = best.stats.cache_bits;
+ stats->palette_size = best.stats.palette_size;
+ stats->lossless_size = best.stats.lossless_size;
+ stats->lossless_hdr_size = best.stats.lossless_hdr_size;
+ stats->lossless_data_size = best.stats.lossless_data_size;
+ }
+#else
+ (void)stats;
+#endif
+ *output_size = VP8BitWriterSize(&best.bw);
+ *output = VP8BitWriterBuf(&best.bw);
+ } else {
+ VP8BitWriterWipeOut(&best.bw);
+ }
+ return ok;
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
+ int quality, int method, int filter,
+ int effort_level,
+ uint8_t** const output, size_t* const output_size) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+
+ uint8_t* quant_alpha = NULL;
+ const size_t data_size = width * height;
+ uint64_t sse = 0;
+ int ok = 1;
+ const int reduce_levels = (quality < 100);
+
+ // quick correctness checks
+ assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
+ assert(enc != NULL && pic != NULL && pic->a != NULL);
+ assert(output != NULL && output_size != NULL);
+ assert(width > 0 && height > 0);
+ assert(pic->a_stride >= width);
+ assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
+
+ if (quality < 0 || quality > 100) {
+ return 0;
+ }
+
+ if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
+ return 0;
+ }
+
+ if (method == ALPHA_NO_COMPRESSION) {
+    // Don't filter, as filtering will have no impact on the compressed size.
+ filter = WEBP_FILTER_NONE;
+ }
+
+ quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+ if (quant_alpha == NULL) {
+ return 0;
+ }
+
+ // Extract alpha data (width x height) from raw_data (stride x height).
+ WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+
+  if (reduce_levels) {  // No quantization required for 'quality = 100'.
+    // 16 alpha levels gives quite a low MSE w.r.t. the original alpha plane,
+    // and is mapped to the moderate quality 70. Hence Quality [0, 70] maps to
+    // Levels [2, 16], and Quality ]70, 100] maps to Levels ]16, 256].
+ const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
+ : (16 + (quality - 70) * 8);
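+    // e.g. (illustrative): quality 50 -> 2 + 50 / 5 = 12 levels;
+    //      quality 90 -> 16 + (90 - 70) * 8 = 176 levels.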
+ ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
+ }
+
+ if (ok) {
+ VP8FiltersInit();
+ ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
+ filter, reduce_levels, effort_level, output,
+ output_size, pic->stats);
+#if !defined(WEBP_DISABLE_STATS)
+ if (pic->stats != NULL) { // need stats?
+ pic->stats->coded_size += (int)(*output_size);
+ enc->sse_[3] = sse;
+ }
+#endif
+ }
+
+ WebPSafeFree(quant_alpha);
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+// Main calls
+
+static int CompressAlphaJob(void* arg1, void* unused) {
+ VP8Encoder* const enc = (VP8Encoder*)arg1;
+ const WebPConfig* config = enc->config_;
+ uint8_t* alpha_data = NULL;
+ size_t alpha_size = 0;
+ const int effort_level = config->method; // maps to [0..6]
+ const WEBP_FILTER_TYPE filter =
+ (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+ (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+ WEBP_FILTER_BEST;
+ if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+ filter, effort_level, &alpha_data, &alpha_size)) {
+ return 0;
+ }
+ if (alpha_size != (uint32_t)alpha_size) { // Soundness check.
+ WebPSafeFree(alpha_data);
+ return 0;
+ }
+ enc->alpha_data_size_ = (uint32_t)alpha_size;
+ enc->alpha_data_ = alpha_data;
+ (void)unused;
+ return 1;
+}
+
+void VP8EncInitAlpha(VP8Encoder* const enc) {
+ WebPInitAlphaProcessing();
+ enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
+ enc->alpha_data_ = NULL;
+ enc->alpha_data_size_ = 0;
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ WebPGetWorkerInterface()->Init(worker);
+ worker->data1 = enc;
+ worker->data2 = NULL;
+ worker->hook = CompressAlphaJob;
+ }
+}
+
+int VP8EncStartAlpha(VP8Encoder* const enc) {
+ if (enc->has_alpha_) {
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ // Makes sure worker is good to go.
+ if (!WebPGetWorkerInterface()->Reset(worker)) {
+ return 0;
+ }
+ WebPGetWorkerInterface()->Launch(worker);
+ return 1;
+ } else {
+ return CompressAlphaJob(enc, NULL); // just do the job right away
+ }
+ }
+ return 1;
+}
+
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
+ if (enc->has_alpha_) {
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ if (!WebPGetWorkerInterface()->Sync(worker)) return 0; // error
+ }
+ }
+ return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+int VP8EncDeleteAlpha(VP8Encoder* const enc) {
+ int ok = 1;
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ // finish anything left in flight
+ ok = WebPGetWorkerInterface()->Sync(worker);
+ // still need to end the worker, even if !ok
+ WebPGetWorkerInterface()->End(worker);
+ }
+ WebPSafeFree(enc->alpha_data_);
+ enc->alpha_data_ = NULL;
+ enc->alpha_data_size_ = 0;
+ enc->has_alpha_ = 0;
+ return ok;
+}
diff --git a/media/libwebp/enc/analysis_enc.c b/media/libwebp/enc/analysis_enc.c
new file mode 100644
index 0000000000..489434b3d1
--- /dev/null
+++ b/media/libwebp/enc/analysis_enc.c
@@ -0,0 +1,475 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Macroblock analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+#include "../utils/utils.h"
+
+#define MAX_ITERS_K_MEANS 6
+
+//------------------------------------------------------------------------------
+// Smooth the segment map by replacing each isolated block with the majority
+// segment of its neighbours.
+
+static void SmoothSegmentMap(VP8Encoder* const enc) {
+ int n, x, y;
+ const int w = enc->mb_w_;
+ const int h = enc->mb_h_;
+ const int majority_cnt_3_x_3_grid = 5;
+ uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
+ assert((uint64_t)(w * h) == (uint64_t)w * h); // no overflow, as per spec
+
+ if (tmp == NULL) return;
+ for (y = 1; y < h - 1; ++y) {
+ for (x = 1; x < w - 1; ++x) {
+ int cnt[NUM_MB_SEGMENTS] = { 0 };
+ const VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+ int majority_seg = mb->segment_;
+ // Check the 8 neighbouring segment values.
+ cnt[mb[-w - 1].segment_]++; // top-left
+ cnt[mb[-w + 0].segment_]++; // top
+ cnt[mb[-w + 1].segment_]++; // top-right
+ cnt[mb[ - 1].segment_]++; // left
+ cnt[mb[ + 1].segment_]++; // right
+ cnt[mb[ w - 1].segment_]++; // bottom-left
+ cnt[mb[ w + 0].segment_]++; // bottom
+ cnt[mb[ w + 1].segment_]++; // bottom-right
+ for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+ if (cnt[n] >= majority_cnt_3_x_3_grid) {
+ majority_seg = n;
+ break;
+ }
+ }
+ tmp[x + y * w] = majority_seg;
+ }
+ }
+ for (y = 1; y < h - 1; ++y) {
+ for (x = 1; x < w - 1; ++x) {
+ VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+ mb->segment_ = tmp[x + y * w];
+ }
+ }
+ WebPSafeFree(tmp);
+}
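+
+// Example: a block assigned to segment 0 whose eight neighbours include five
+// or more blocks of segment 2 is re-assigned to segment 2; ties and smaller
+// counts leave the block's segment untouched.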
+
+//------------------------------------------------------------------------------
+// set segment susceptibility alpha_ / beta_
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+ return (v < m) ? m : (v > M) ? M : v;
+}
+
+static void SetSegmentAlphas(VP8Encoder* const enc,
+ const int centers[NUM_MB_SEGMENTS],
+ int mid) {
+ const int nb = enc->segment_hdr_.num_segments_;
+ int min = centers[0], max = centers[0];
+ int n;
+
+ if (nb > 1) {
+ for (n = 0; n < nb; ++n) {
+ if (min > centers[n]) min = centers[n];
+ if (max < centers[n]) max = centers[n];
+ }
+ }
+ if (max == min) max = min + 1;
+ assert(mid <= max && mid >= min);
+ for (n = 0; n < nb; ++n) {
+ const int alpha = 255 * (centers[n] - mid) / (max - min);
+ const int beta = 255 * (centers[n] - min) / (max - min);
+ enc->dqm_[n].alpha_ = clip(alpha, -127, 127);
+ enc->dqm_[n].beta_ = clip(beta, 0, 255);
+ }
+}
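+
+// Worked example: with centers {10, 50, 90} and mid == 50, the loop above
+// produces alpha_ == {-127, 0, 127} and beta_ == {0, 127, 255} (integer
+// division, then clipping).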
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+#define MAX_ALPHA 255 // 8b of precision for susceptibilities.
+#define ALPHA_SCALE (2 * MAX_ALPHA) // scaling factor for alpha.
+#define DEFAULT_ALPHA (-1)
+#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
+
+static int FinalAlphaValue(int alpha) {
+ alpha = MAX_ALPHA - alpha;
+ return clip(alpha, 0, MAX_ALPHA);
+}
+
+static int GetAlpha(const VP8Histogram* const histo) {
+  // 'alpha' will later be clipped to the [0..MAX_ALPHA] range, clamping
+  // outer values which happen to be mostly noise. This leaves the maximum
+  // precision for handling the useful small values which contribute most.
+ const int max_value = histo->max_value;
+ const int last_non_zero = histo->last_non_zero;
+ const int alpha =
+ (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+ return alpha;
+}
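+
+// For instance, a histogram with last_non_zero == 10 and max_value == 4
+// yields alpha == 510 * 10 / 4 == 1275; values beyond MAX_ALPHA are later
+// clamped once FinalAlphaValue() is applied to the mixed result.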
+
+static void InitHistogram(VP8Histogram* const histo) {
+ histo->max_value = 0;
+ histo->last_non_zero = 1;
+}
+
+//------------------------------------------------------------------------------
+// Simplified k-Means, to assign Nb segments based on alpha-histogram
+
+static void AssignSegments(VP8Encoder* const enc,
+ const int alphas[MAX_ALPHA + 1]) {
+  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+  // explicit check is needed to avoid a spurious warning about 'n + 1'
+  // exceeding the array bounds of 'centers' with some compilers (noticed
+  // with gcc-4.9).
+ const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+ enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
+ int centers[NUM_MB_SEGMENTS];
+ int weighted_average = 0;
+ int map[MAX_ALPHA + 1];
+ int a, n, k;
+ int min_a = 0, max_a = MAX_ALPHA, range_a;
+ // 'int' type is ok for histo, and won't overflow
+ int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
+
+ assert(nb >= 1);
+ assert(nb <= NUM_MB_SEGMENTS);
+
+ // bracket the input
+ for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+ min_a = n;
+ for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+ max_a = n;
+ range_a = max_a - min_a;
+
+ // Spread initial centers evenly
+ for (k = 0, n = 1; k < nb; ++k, n += 2) {
+ assert(n < 2 * nb);
+ centers[k] = min_a + (n * range_a) / (2 * nb);
+ }
+
+ for (k = 0; k < MAX_ITERS_K_MEANS; ++k) { // few iters are enough
+ int total_weight;
+ int displaced;
+ // Reset stats
+ for (n = 0; n < nb; ++n) {
+ accum[n] = 0;
+ dist_accum[n] = 0;
+ }
+ // Assign nearest center for each 'a'
+ n = 0; // track the nearest center for current 'a'
+ for (a = min_a; a <= max_a; ++a) {
+ if (alphas[a]) {
+ while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+ n++;
+ }
+ map[a] = n;
+ // accumulate contribution into best centroid
+ dist_accum[n] += a * alphas[a];
+ accum[n] += alphas[a];
+ }
+ }
+    // All points are classified. Move each centroid to the center of its
+    // respective cloud.
+ displaced = 0;
+ weighted_average = 0;
+ total_weight = 0;
+ for (n = 0; n < nb; ++n) {
+ if (accum[n]) {
+ const int new_center = (dist_accum[n] + accum[n] / 2) / accum[n];
+ displaced += abs(centers[n] - new_center);
+ centers[n] = new_center;
+ weighted_average += new_center * accum[n];
+ total_weight += accum[n];
+ }
+ }
+ weighted_average = (weighted_average + total_weight / 2) / total_weight;
+ if (displaced < 5) break; // no need to keep on looping...
+ }
+
+ // Map each original value to the closest centroid
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ VP8MBInfo* const mb = &enc->mb_info_[n];
+ const int alpha = mb->alpha_;
+ mb->segment_ = map[alpha];
+ mb->alpha_ = centers[map[alpha]]; // for the record.
+ }
+
+ if (nb > 1) {
+ const int smooth = (enc->config_->preprocessing & 1);
+ if (smooth) SmoothSegmentMap(enc);
+ }
+
+ SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas.
+}
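+
+// Example of the initial spread above: for nb == 2 over the full [0, 255]
+// alpha range, the centers start at 255 / 4 == 63 and 3 * 255 / 4 == 191,
+// and the k-means iterations then refine them.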
+
+//------------------------------------------------------------------------------
+// Macroblock analysis: collect histogram for each mode, deduce the maximal
+// susceptibility and set best modes for this macroblock.
+// Segment assignment is done later.
+
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
+// the possible modes during the analysis phase: we risk falling into a local
+// optimum or being subject to boundary effects.
+#define MAX_INTRA16_MODE 2
+#define MAX_INTRA4_MODE 2
+#define MAX_UV_MODE 2
+
+static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
+ const int max_mode = MAX_INTRA16_MODE;
+ int mode;
+ int best_alpha = DEFAULT_ALPHA;
+ int best_mode = 0;
+
+ VP8MakeLuma16Preds(it);
+ for (mode = 0; mode < max_mode; ++mode) {
+ VP8Histogram histo;
+ int alpha;
+
+ InitHistogram(&histo);
+ VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
+ it->yuv_p_ + VP8I16ModeOffsets[mode],
+ 0, 16, &histo);
+ alpha = GetAlpha(&histo);
+ if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+ best_alpha = alpha;
+ best_mode = mode;
+ }
+ }
+ VP8SetIntra16Mode(it, best_mode);
+ return best_alpha;
+}
+
+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, which should be around 16 (~= block size). We
+  // use the [8-17] range and favor intra4 at high quality, intra16 at low
+  // quality.
+ const int q = (int)it->enc_->config_->quality;
+ const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+ int k;
+ uint32_t dc[16], m, m2;
+ for (k = 0; k < 16; k += 4) {
+ VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+ }
+ for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+ m += dc[k];
+ m2 += dc[k] * dc[k];
+ }
+ if (kThreshold * m2 < m * m) {
+ VP8SetIntra16Mode(it, 0); // DC16
+ } else {
+ const uint8_t modes[16] = { 0 }; // DC4
+ VP8SetIntra4Mode(it, modes);
+ }
+ return 0;
+}
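+
+// Rough numbers for the cut-off above: quality 75 gives kThreshold ==
+// 8 + (17 - 8) * 75 / 100 == 14 (integer math). A perfectly flat block has
+// m * m == 16 * m2, so it picks DC16 for any threshold below 16, while
+// textured blocks fall through to the intra4 DC modes.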
+
+static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
+ int best_alpha = DEFAULT_ALPHA;
+ int smallest_alpha = 0;
+ int best_mode = 0;
+ const int max_mode = MAX_UV_MODE;
+ int mode;
+
+ VP8MakeChroma8Preds(it);
+ for (mode = 0; mode < max_mode; ++mode) {
+ VP8Histogram histo;
+ int alpha;
+ InitHistogram(&histo);
+ VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
+ it->yuv_p_ + VP8UVModeOffsets[mode],
+ 16, 16 + 4 + 4, &histo);
+ alpha = GetAlpha(&histo);
+ if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+ best_alpha = alpha;
+ }
+ // The best prediction mode tends to be the one with the smallest alpha.
+ if (mode == 0 || alpha < smallest_alpha) {
+ smallest_alpha = alpha;
+ best_mode = mode;
+ }
+ }
+ VP8SetIntraUVMode(it, best_mode);
+ return best_alpha;
+}
+
+static void MBAnalyze(VP8EncIterator* const it,
+ int alphas[MAX_ALPHA + 1],
+ int* const alpha, int* const uv_alpha) {
+ const VP8Encoder* const enc = it->enc_;
+ int best_alpha, best_uv_alpha;
+
+ VP8SetIntra16Mode(it, 0); // default: Intra16, DC_PRED
+ VP8SetSkip(it, 0); // not skipped
+ VP8SetSegment(it, 0); // default segment, spec-wise.
+
+ if (enc->method_ <= 1) {
+ best_alpha = FastMBAnalyze(it);
+ } else {
+ best_alpha = MBAnalyzeBestIntra16Mode(it);
+ }
+ best_uv_alpha = MBAnalyzeBestUVMode(it);
+
+ // Final susceptibility mix
+ best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
+ best_alpha = FinalAlphaValue(best_alpha);
+ alphas[best_alpha]++;
+ it->mb_->alpha_ = best_alpha; // for later remapping.
+
+ // Accumulate for later complexity analysis.
+ *alpha += best_alpha; // mixed susceptibility (not just luma)
+ *uv_alpha += best_uv_alpha;
+}
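+
+// Sketch of the mix above: luma alpha 100 and UV alpha 60 combine to
+// (3 * 100 + 60 + 2) >> 2 == 90, which FinalAlphaValue() maps to
+// 255 - 90 == 165.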
+
+static void DefaultMBInfo(VP8MBInfo* const mb) {
+ mb->type_ = 1; // I16x16
+ mb->uv_mode_ = 0;
+ mb->skip_ = 0; // not skipped
+ mb->segment_ = 0; // default segment
+ mb->alpha_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Main analysis loop:
+// Collect all susceptibilities for each macroblock and record their
+// distribution in alphas[]. Segments are assigned a posteriori, based on
+// this histogram.
+// We also pick an intra16 prediction mode, which shouldn't be considered
+// final except for fast-encode settings. We can also pick some intra4 modes
+// and decide intra4/intra16, but that's almost always a bad choice at this
+// stage.
+
+static void ResetAllMBInfo(VP8Encoder* const enc) {
+ int n;
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ DefaultMBInfo(&enc->mb_info_[n]);
+ }
+ // Default susceptibilities.
+ enc->dqm_[0].alpha_ = 0;
+ enc->dqm_[0].beta_ = 0;
+ // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+ enc->alpha_ = 0;
+ enc->uv_alpha_ = 0;
+ WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+// struct used to collect job result
+typedef struct {
+ WebPWorker worker;
+ int alphas[MAX_ALPHA + 1];
+ int alpha, uv_alpha;
+ VP8EncIterator it;
+ int delta_progress;
+} SegmentJob;
+
+// main work call
+static int DoSegmentsJob(void* arg1, void* arg2) {
+ SegmentJob* const job = (SegmentJob*)arg1;
+ VP8EncIterator* const it = (VP8EncIterator*)arg2;
+ int ok = 1;
+ if (!VP8IteratorIsDone(it)) {
+ uint8_t tmp[32 + WEBP_ALIGN_CST];
+ uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
+ do {
+ // Let's pretend we have perfect lossless reconstruction.
+ VP8IteratorImport(it, scratch);
+ MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+ ok = VP8IteratorProgress(it, job->delta_progress);
+ } while (ok && VP8IteratorNext(it));
+ }
+ return ok;
+}
+
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
+ int i;
+ for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
+ dst->alpha += src->alpha;
+ dst->uv_alpha += src->uv_alpha;
+}
+
+// initialize the job struct with some tasks to perform
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+ int start_row, int end_row) {
+ WebPGetWorkerInterface()->Init(&job->worker);
+ job->worker.data1 = job;
+ job->worker.data2 = &job->it;
+ job->worker.hook = DoSegmentsJob;
+ VP8IteratorInit(enc, &job->it);
+ VP8IteratorSetRow(&job->it, start_row);
+ VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
+ memset(job->alphas, 0, sizeof(job->alphas));
+ job->alpha = 0;
+ job->uv_alpha = 0;
+  // only one of the two jobs can record the progress, since we don't
+  // expect the user's hook to be thread-safe
+ job->delta_progress = (start_row == 0) ? 20 : 0;
+}
+
+// main entry point
+int VP8EncAnalyze(VP8Encoder* const enc) {
+ int ok = 1;
+ const int do_segments =
+ enc->config_->emulate_jpeg_size || // We need the complexity evaluation.
+ (enc->segment_hdr_.num_segments_ > 1) ||
+ (enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled.
+ if (do_segments) {
+ const int last_row = enc->mb_h_;
+    // We give a little more than half the work to the main thread.
+ const int split_row = (9 * last_row + 15) >> 4;
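+    // (E.g. last_row == 30 gives split_row == (9 * 30 + 15) >> 4 == 17, so
+    // the main thread analyzes rows [0, 17) and the side job rows [17, 30).)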
+ const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+ const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it
+ const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+ const int do_mt = 0;
+#endif
+ const WebPWorkerInterface* const worker_interface =
+ WebPGetWorkerInterface();
+ SegmentJob main_job;
+ if (do_mt) {
+ SegmentJob side_job;
+      // Note the use of '&=' instead of '&&' below, because the functions
+      // must be called no matter what.
+ InitSegmentJob(enc, &main_job, 0, split_row);
+ InitSegmentJob(enc, &side_job, split_row, last_row);
+ // we don't need to call Reset() on main_job.worker, since we're calling
+ // WebPWorkerExecute() on it
+ ok &= worker_interface->Reset(&side_job.worker);
+ // launch the two jobs in parallel
+ if (ok) {
+ worker_interface->Launch(&side_job.worker);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&side_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
+ }
+ worker_interface->End(&side_job.worker);
+ if (ok) MergeJobs(&side_job, &main_job); // merge results together
+ } else {
+ // Even for single-thread case, we use the generic Worker tools.
+ InitSegmentJob(enc, &main_job, 0, last_row);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
+ }
+ worker_interface->End(&main_job.worker);
+ if (ok) {
+ enc->alpha_ = main_job.alpha / total_mb;
+ enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+ AssignSegments(enc, main_job.alphas);
+ }
+ } else { // Use only one default segment.
+ ResetAllMBInfo(enc);
+ }
+ return ok;
+}
+
diff --git a/media/libwebp/enc/backward_references_cost_enc.c b/media/libwebp/enc/backward_references_cost_enc.c
new file mode 100644
index 0000000000..59e2c0f611
--- /dev/null
+++ b/media/libwebp/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v);
+
+typedef struct {
+ double alpha_[VALUES_IN_BYTE];
+ double red_[VALUES_IN_BYTE];
+ double blue_[VALUES_IN_BYTE];
+ double distance_[NUM_DISTANCE_CODES];
+ double* literal_;
+} CostModel;
+
+static void ConvertPopulationCountTableToBitEstimates(
+ int num_symbols, const uint32_t population_counts[], double output[]) {
+ uint32_t sum = 0;
+ int nonzeros = 0;
+ int i;
+ for (i = 0; i < num_symbols; ++i) {
+ sum += population_counts[i];
+ if (population_counts[i] > 0) {
+ ++nonzeros;
+ }
+ }
+ if (nonzeros <= 1) {
+ memset(output, 0, num_symbols * sizeof(*output));
+ } else {
+ const double logsum = VP8LFastLog2(sum);
+ for (i = 0; i < num_symbols; ++i) {
+ output[i] = logsum - VP8LFastLog2(population_counts[i]);
+ }
+ }
+}
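+
+// The estimate above is the ideal Shannon code length: output[i] ==
+// log2(sum) - log2(population_counts[i]) == -log2(p_i) bits for a symbol of
+// probability p_i, computed with the fast log2 approximation.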
+
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+ const VP8LBackwardRefs* const refs) {
+ int ok = 0;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+ if (histo == NULL) goto Error;
+
+ // The following code is similar to VP8LHistogramCreate but converts the
+ // distance to plane code.
+ VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1);
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
+ xsize);
+ VP8LRefsCursorNext(&c);
+ }
+
+ ConvertPopulationCountTableToBitEstimates(
+ VP8LHistogramNumCodes(histo->palette_code_bits_),
+ histo->literal_, m->literal_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->red_, m->red_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->blue_, m->blue_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+ ConvertPopulationCountTableToBitEstimates(
+ NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+ ok = 1;
+
+ Error:
+ VP8LFreeHistogram(histo);
+ return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+ return m->alpha_[v >> 24] +
+ m->red_[(v >> 16) & 0xff] +
+ m->literal_[(v >> 8) & 0xff] +
+ m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+ const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+ return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+ uint32_t length) {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(length, &code, &extra_bits);
+ return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+ uint32_t distance) {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+ return m->distance_[code] + extra_bits;
+}
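+
+// In both helpers above, a value that prefix-encodes to code 'c' with 'n'
+// extra bits is modeled as costing the entropy estimate for 'c' plus 'n'
+// bits, the extra bits being assumed to be stored verbatim.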
+
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+ const uint32_t* const argb, VP8LColorCache* const hashers,
+ const CostModel* const cost_model, int idx, int use_color_cache,
+ float prev_cost, float* const cost, uint16_t* const dist_array) {
+ double cost_val = prev_cost;
+ const uint32_t color = argb[idx];
+ const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+ if (ix >= 0) {
+ // use_color_cache is true and hashers contains color
+ const double mul0 = 0.68;
+ cost_val += GetCacheCost(cost_model, ix) * mul0;
+ } else {
+ const double mul1 = 0.82;
+ if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+ cost_val += GetLiteralCost(cost_model, color) * mul1;
+ }
+ if (cost[idx] > cost_val) {
+ cost[idx] = (float)cost_val;
+ dist_array[idx] = 1; // only one is inserted.
+ }
+}
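+
+// The 0.68 and 0.82 multipliers above are, presumably, empirically tuned
+// discounts that bias this Zopfli-like pass toward cache hits and literals
+// respectively; they do not reflect exact bit costs.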
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value that avoids high memory consumption while staying fast.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform a backward reference search, every pixel at index 'index_' is
+// considered and the cost for the MAX_LENGTH following pixels is computed.
+// Those following pixels at index index_ + k (k from 0 to MAX_LENGTH - 1)
+// have a cost of:
+// cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+ float cost_;
+ int start_;
+ int end_;
+ int index_;
+ CostInterval* previous_;
+ CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+ double cost_;
+ int start_;
+ int end_; // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the CostCacheInterval's, the GetLengthCost(cost_model, k) values
+// in cost_cache_, and the CostInterval's (whose count_ is limited by
+// COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+ CostInterval* head_;
+ int count_; // The number of stored intervals.
+ CostCacheInterval* cache_intervals_;
+ size_t cache_intervals_size_;
+ double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
+ float* costs_;
+ uint16_t* dist_array_;
+  // Most of the time, we only need a few intervals -> use a free-list, to
+  // avoid fragmentation with small allocs in the most common cases.
+ CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+ CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. Note that this list can't grow
+  // larger than COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST.
+ CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+ CostInterval* const interval) {
+ interval->next_ = manager->free_intervals_;
+ manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+ const CostInterval* const interval) {
+ return (interval >= &manager->intervals_[0] &&
+ interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+static void CostManagerInitFreeList(CostManager* const manager) {
+ int i;
+ manager->free_intervals_ = NULL;
+ for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
+ CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
+ }
+}
+
+static void DeleteIntervalList(CostManager* const manager,
+ const CostInterval* interval) {
+ while (interval != NULL) {
+ const CostInterval* const next = interval->next_;
+ if (!CostIntervalIsInFreeList(manager, interval)) {
+ WebPSafeFree((void*)interval);
+ } // else: do nothing
+ interval = next;
+ }
+}
+
+static void CostManagerClear(CostManager* const manager) {
+ if (manager == NULL) return;
+
+ WebPSafeFree(manager->costs_);
+ WebPSafeFree(manager->cache_intervals_);
+
+ // Clear the interval lists.
+ DeleteIntervalList(manager, manager->head_);
+ manager->head_ = NULL;
+ DeleteIntervalList(manager, manager->recycled_intervals_);
+ manager->recycled_intervals_ = NULL;
+
+ // Reset pointers, count_ and cache_intervals_size_.
+ memset(manager, 0, sizeof(*manager));
+ CostManagerInitFreeList(manager);
+}
+
+static int CostManagerInit(CostManager* const manager,
+ uint16_t* const dist_array, int pix_count,
+ const CostModel* const cost_model) {
+ int i;
+ const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+
+ manager->costs_ = NULL;
+ manager->cache_intervals_ = NULL;
+ manager->head_ = NULL;
+ manager->recycled_intervals_ = NULL;
+ manager->count_ = 0;
+ manager->dist_array_ = dist_array;
+ CostManagerInitFreeList(manager);
+
+ // Fill in the cost_cache_.
+ manager->cache_intervals_size_ = 1;
+ manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+ for (i = 1; i < cost_cache_size; ++i) {
+ manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+ // Get the number of bound intervals.
+ if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
+ ++manager->cache_intervals_size_;
+ }
+ }
+
+  // With the current cost model, we usually have fewer than 20 intervals.
+  // The worst-case scenario would be every length having a different cost,
+  // hence MAX_LENGTH intervals, but that is impossible with the current
+  // implementation, which spirals around a pixel.
+ assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+ manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+ manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+ if (manager->cache_intervals_ == NULL) {
+ CostManagerClear(manager);
+ return 0;
+ }
+
+ // Fill in the cache_intervals_.
+ {
+ CostCacheInterval* cur = manager->cache_intervals_;
+
+    // Consecutive values in cost_cache_ are compared and, whenever a
+    // difference is found, a new interval is created and bounded.
+ cur->start_ = 0;
+ cur->end_ = 1;
+ cur->cost_ = manager->cost_cache_[0];
+ for (i = 1; i < cost_cache_size; ++i) {
+ const double cost_val = manager->cost_cache_[i];
+ if (cost_val != cur->cost_) {
+ ++cur;
+ // Initialize an interval.
+ cur->start_ = i;
+ cur->cost_ = cost_val;
+ }
+ cur->end_ = i + 1;
+ }
+ }
+
+ manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+ if (manager->costs_ == NULL) {
+ CostManagerClear(manager);
+ return 0;
+ }
+ // Set the initial costs_ high for every pixel as we will keep the minimum.
+ for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+
+ return 1;
+}
+
+// Given the cost and the position that define an interval, update the cost at
+// pixel 'i' if it is smaller than the previously computed value.
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+ int position, float cost) {
+ const int k = i - position;
+ assert(k >= 0 && k < MAX_LENGTH);
+
+ if (manager->costs_[i] > cost) {
+ manager->costs_[i] = cost;
+ manager->dist_array_[i] = k + 1;
+ }
+}
+
+// Given the cost and the position that define an interval, update the cost for
+// all the pixels between 'start' and 'end' excluded.
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+ int start, int end, int position,
+ float cost) {
+ int i;
+ for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
+}
+
+// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+ CostInterval* const prev,
+ CostInterval* const next) {
+ if (prev != NULL) {
+ prev->next_ = next;
+ } else {
+ manager->head_ = next;
+ }
+
+ if (next != NULL) next->previous_ = prev;
+}
+
+// Pop an interval in the manager.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+ CostInterval* const interval) {
+ if (interval == NULL) return;
+
+ ConnectIntervals(manager, interval->previous_, interval->next_);
+ if (CostIntervalIsInFreeList(manager, interval)) {
+ CostIntervalAddToFreeList(manager, interval);
+ } else { // recycle regularly malloc'd intervals too
+ interval->next_ = manager->recycled_intervals_;
+ manager->recycled_intervals_ = interval;
+ }
+ --manager->count_;
+ assert(manager->count_ >= 0);
+}
+
+// Update the cost at index i by going over all the stored intervals that
+// overlap with i.
+// If 'do_clean_intervals' is non-zero, intervals that end before 'i' will be
+// popped.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+ int do_clean_intervals) {
+ CostInterval* current = manager->head_;
+
+ while (current != NULL && current->start_ <= i) {
+ CostInterval* const next = current->next_;
+ if (current->end_ <= i) {
+ if (do_clean_intervals) {
+ // We have an outdated interval, remove it.
+ PopInterval(manager, current);
+ }
+ } else {
+ UpdateCost(manager, i, current->index_, current->cost_);
+ }
+ current = next;
+ }
+}
+
+// Given a currently orphaned interval and the interval that preceded it
+// before it was orphaned (which can be NULL), place it back at the right
+// spot in the list of intervals, using the start_ ordering and the previous
+// interval as a hint.
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+ CostInterval* const current,
+ CostInterval* previous) {
+ assert(current != NULL);
+
+ if (previous == NULL) previous = manager->head_;
+ while (previous != NULL && current->start_ < previous->start_) {
+ previous = previous->previous_;
+ }
+ while (previous != NULL && previous->next_ != NULL &&
+ previous->next_->start_ < current->start_) {
+ previous = previous->next_;
+ }
+
+ if (previous != NULL) {
+ ConnectIntervals(manager, current, previous->next_);
+ } else {
+ ConnectIntervals(manager, current, manager->head_);
+ }
+ ConnectIntervals(manager, previous, current);
+}
+
+// Insert an interval in the list contained in the manager, using interval_in
+// as a search hint. The intervals are sorted by start_ value.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+ CostInterval* const interval_in,
+ float cost, int position, int start,
+ int end) {
+ CostInterval* interval_new;
+
+ if (start >= end) return;
+ if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
+ // Serialize the interval if we cannot store it.
+ UpdateCostPerInterval(manager, start, end, position, cost);
+ return;
+ }
+ if (manager->free_intervals_ != NULL) {
+ interval_new = manager->free_intervals_;
+ manager->free_intervals_ = interval_new->next_;
+ } else if (manager->recycled_intervals_ != NULL) {
+ interval_new = manager->recycled_intervals_;
+ manager->recycled_intervals_ = interval_new->next_;
+ } else { // malloc for good
+ interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
+ if (interval_new == NULL) {
+ // Write down the interval if we cannot create it.
+ UpdateCostPerInterval(manager, start, end, position, cost);
+ return;
+ }
+ }
+
+ interval_new->cost_ = cost;
+ interval_new->index_ = position;
+ interval_new->start_ = start;
+ interval_new->end_ = end;
+ PositionOrphanInterval(manager, interval_new, interval_in);
+
+ ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+ double distance_cost, int position,
+ int len) {
+ size_t i;
+ CostInterval* interval = manager->head_;
+ CostInterval* interval_next;
+ const CostCacheInterval* const cost_cache_intervals =
+ manager->cache_intervals_;
+ // If the interval is small enough, no need to deal with the heavy
+ // interval logic, just serialize it right away. This constant is empirical.
+ const int kSkipDistance = 10;
+
+ if (len < kSkipDistance) {
+ int j;
+ for (j = position; j < position + len; ++j) {
+ const int k = j - position;
+ float cost_tmp;
+ assert(k >= 0 && k < MAX_LENGTH);
+ cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+ if (manager->costs_[j] > cost_tmp) {
+ manager->costs_[j] = cost_tmp;
+ manager->dist_array_[j] = k + 1;
+ }
+ }
+ return;
+ }
+
+ for (i = 0; i < manager->cache_intervals_size_ &&
+ cost_cache_intervals[i].start_ < len;
+ ++i) {
+ // Define the intersection of the ith interval with the new one.
+ int start = position + cost_cache_intervals[i].start_;
+ const int end = position + (cost_cache_intervals[i].end_ > len
+ ? len
+ : cost_cache_intervals[i].end_);
+ const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+ for (; interval != NULL && interval->start_ < end;
+ interval = interval_next) {
+ interval_next = interval->next_;
+
+ // Make sure we have some overlap
+ if (start >= interval->end_) continue;
+
+ if (cost >= interval->cost_) {
+        // In the diagrams below, the lower an interval is drawn, the better
+        // (cheaper) it is.
+ // [**********************************************************[
+ // start end
+ // [----------------------------------[
+ // interval->start_ interval->end_
+ // If we are worse than what we already have, add whatever we have so
+ // far up to interval.
+ const int start_new = interval->end_;
+ InsertInterval(manager, interval, cost, position, start,
+ interval->start_);
+ start = start_new;
+ if (start >= end) break;
+ continue;
+ }
+
+ if (start <= interval->start_) {
+ if (interval->end_ <= end) {
+ // [----------------------------------[
+ // interval->start_ interval->end_
+ // [**************************************************************[
+ // start end
+ // We can safely remove the old interval as it is fully included.
+ PopInterval(manager, interval);
+ } else {
+ // [------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ interval->start_ = end;
+ break;
+ }
+ } else {
+ if (end < interval->end_) {
+ // [--------------------------------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ // We have to split the old interval as it fully contains the new one.
+ const int end_original = interval->end_;
+ interval->end_ = start;
+ InsertInterval(manager, interval, interval->cost_, interval->index_,
+ end, end_original);
+ interval = interval->next_;
+ break;
+ } else {
+ // [------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ interval->end_ = start;
+ }
+ }
+ }
+ // Insert the remaining interval from start to end.
+ InsertInterval(manager, interval, cost, position, start, end);
+ }
+}
+
+static int BackwardReferencesHashChainDistanceOnly(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+ uint16_t* const dist_array) {
+ int i;
+ int ok = 0;
+ int cc_init = 0;
+ const int pix_count = xsize * ysize;
+ const int use_color_cache = (cache_bits > 0);
+ const size_t literal_array_size =
+ sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+ ((cache_bits > 0) ? (1 << cache_bits) : 0));
+ const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+ CostModel* const cost_model =
+ (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+ VP8LColorCache hashers;
+ CostManager* cost_manager =
+ (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+ int offset_prev = -1, len_prev = -1;
+ double offset_cost = -1;
+ int first_offset_is_constant = -1; // initialized with 'impossible' value
+ int reach = 0;
+
+ if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+ cost_model->literal_ = (double*)(cost_model + 1);
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+
+ if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
+ goto Error;
+ }
+
+ if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+ goto Error;
+ }
+
+  // We loop one pixel at a time, but store the currently best points for
+  // all the not-yet-processed locations reachable from this point.
+ dist_array[0] = 0;
+ // Add first pixel as literal.
+ AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+ 0.f, cost_manager->costs_, dist_array);
+
+ for (i = 1; i < pix_count; ++i) {
+ const float prev_cost = cost_manager->costs_[i - 1];
+ int offset, len;
+ VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+ // Try adding the pixel as a literal.
+ AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+ use_color_cache, prev_cost,
+ cost_manager->costs_, dist_array);
+
+ // If we are dealing with a non-literal.
+ if (len >= 2) {
+ if (offset != offset_prev) {
+ const int code = VP8LDistanceToPlaneCode(xsize, offset);
+ offset_cost = GetDistanceCost(cost_model, code);
+ first_offset_is_constant = 1;
+ PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+ } else {
+ assert(offset_cost >= 0);
+ assert(len_prev >= 0);
+ assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+ // Instead of considering all contributions from a pixel i by calling:
+ // PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+ // we optimize these contributions in case offset_cost stays the same
+ // for consecutive pixels. This describes a set of pixels similar to a
+ // previous set (e.g. constant color regions).
+ if (first_offset_is_constant) {
+ reach = i - 1 + len_prev - 1;
+ first_offset_is_constant = 0;
+ }
+
+ if (i + len - 1 > reach) {
+        // We can only go further with the same offset if the previous
+ // length was maxed, hence len_prev == len == MAX_LENGTH.
+ // TODO(vrabaud), bump i to the end right away (insert cache and
+ // update cost).
+ // TODO(vrabaud), check if one of the points in between does not have
+ // a lower cost.
+ // Already consider the pixel at "reach" to add intervals that are
+ // better than whatever we add.
+ int offset_j, len_j = 0;
+ int j;
+ assert(len == MAX_LENGTH || len == pix_count - i);
+ // Figure out the last consecutive pixel within [i, reach + 1] with
+ // the same offset.
+ for (j = i; j <= reach; ++j) {
+ VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+ if (offset_j != offset) {
+ VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+ break;
+ }
+ }
+ // Update the cost at j - 1 and j.
+ UpdateCostAtIndex(cost_manager, j - 1, 0);
+ UpdateCostAtIndex(cost_manager, j, 0);
+
+ PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+ j, len_j);
+ reach = j + len_j - 1;
+ }
+ }
+ }
+
+ UpdateCostAtIndex(cost_manager, i, 1);
+ offset_prev = offset;
+ len_prev = len;
+ }
+
+ ok = !refs->error_;
+Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ CostManagerClear(cost_manager);
+ WebPSafeFree(cost_model);
+ WebPSafeFree(cost_manager);
+ return ok;
+}
+
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+ int dist_array_size,
+ uint16_t** const chosen_path,
+ int* const chosen_path_size) {
+ uint16_t* path = dist_array + dist_array_size;
+ uint16_t* cur = dist_array + dist_array_size - 1;
+ while (cur >= dist_array) {
+ const int k = *cur;
+ --path;
+ *path = k;
+ cur -= k;
+ }
+ *chosen_path = path;
+ *chosen_path_size = (int)(dist_array + dist_array_size - path);
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+ const uint32_t* const argb, int cache_bits,
+ const uint16_t* const chosen_path, int chosen_path_size,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+ const int use_color_cache = (cache_bits > 0);
+ int ix;
+ int i = 0;
+ int ok = 0;
+ int cc_init = 0;
+ VP8LColorCache hashers;
+
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+
+ VP8LClearBackwardRefs(refs);
+ for (ix = 0; ix < chosen_path_size; ++ix) {
+ const int len = chosen_path[ix];
+ if (len != 1) {
+ int k;
+ const int offset = VP8LHashChainFindOffset(hash_chain, i);
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+ if (use_color_cache) {
+ for (k = 0; k < len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[i + k]);
+ }
+ }
+ i += len;
+ } else {
+ PixOrCopy v;
+ const int idx =
+ use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+ if (idx >= 0) {
+ // use_color_cache is true and hashers contains argb[i]
+ // push pixel as a color cache index
+ v = PixOrCopyCreateCacheIdx(idx);
+ } else {
+ if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+ v = PixOrCopyCreateLiteral(argb[i]);
+ }
+ VP8LBackwardRefsCursorAdd(refs, v);
+ ++i;
+ }
+ }
+ ok = !refs->error_;
+ Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ return ok;
+}
+
+// Returns 1 on success.
+extern int VP8LBackwardReferencesTraceBackwards(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+ const uint32_t* const argb,
+ int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src,
+ VP8LBackwardRefs* const refs_dst) {
+ int ok = 0;
+ const int dist_array_size = xsize * ysize;
+ uint16_t* chosen_path = NULL;
+ int chosen_path_size = 0;
+ uint16_t* dist_array =
+ (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+
+ if (dist_array == NULL) goto Error;
+
+ if (!BackwardReferencesHashChainDistanceOnly(
+ xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
+ goto Error;
+ }
+ TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+ if (!BackwardReferencesHashChainFollowChosenPath(
+ argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+ refs_dst)) {
+ goto Error;
+ }
+ ok = 1;
+ Error:
+ WebPSafeFree(dist_array);
+ return ok;
+}
diff --git a/media/libwebp/enc/backward_references_enc.c b/media/libwebp/enc/backward_references_enc.c
new file mode 100644
index 0000000000..b78610565a
--- /dev/null
+++ b/media/libwebp/enc/backward_references_enc.c
@@ -0,0 +1,1030 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
+
+#define MAX_ENTROPY (1e30f)
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
+
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {
+ 96, 73, 55, 39, 23, 13, 5, 1, 255, 255, 255, 255, 255, 255, 255, 255,
+ 101, 78, 58, 42, 26, 16, 8, 2, 0, 3, 9, 17, 27, 43, 59, 79,
+ 102, 86, 62, 46, 32, 20, 10, 6, 4, 7, 11, 21, 33, 47, 63, 87,
+ 105, 90, 70, 52, 37, 28, 18, 14, 12, 15, 19, 29, 38, 53, 71, 91,
+ 110, 99, 82, 66, 48, 35, 30, 24, 22, 25, 31, 36, 49, 67, 83, 100,
+ 115, 108, 94, 76, 64, 50, 44, 40, 34, 41, 45, 51, 65, 77, 95, 109,
+ 118, 113, 103, 92, 80, 68, 60, 56, 54, 57, 61, 69, 81, 93, 104, 114,
+ 119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117
+};
+
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+int VP8LDistanceToPlaneCode(int xsize, int dist) {
+ const int yoffset = dist / xsize;
+ const int xoffset = dist - yoffset * xsize;
+ if (xoffset <= 8 && yoffset < 8) {
+ return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+ } else if (xoffset > xsize - 8 && yoffset < 7) {
+ return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+ }
+ return dist + 120;
+}
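+
+// Example: dist == xsize (the pixel directly above) gives yoffset == 1 and
+// xoffset == 0, so plane_to_code_lut[24] == 0 and the function returns the
+// smallest code, 1. Distances outside the near neighbourhood fall through
+// to dist + 120.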
+
+// Returns the exact index where array1 and array2 are different. For an index
+// less than or equal to best_len_match, the return value just has to be
+// strictly less than best_len_match. The current behavior is to return 0 if
+// this index is best_len_match, and the index itself otherwise.
+// If the arrays are identical up to max_limit, it returns max_limit.
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+ const uint32_t* const array2,
+ int best_len_match, int max_limit) {
+ // Before 'expensive' linear match, check if the two arrays match at the
+ // current best length index.
+ if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+ return VP8LVectorMismatch(array1, array2, max_limit);
+}
+
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs
+
+struct PixOrCopyBlock {
+ PixOrCopyBlock* next_; // next block (or NULL)
+ PixOrCopy* start_; // data start
+ int size_; // currently used size
+};
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ if (refs->tail_ != NULL) {
+ *refs->tail_ = refs->free_blocks_; // recycle all blocks at once
+ }
+ refs->free_blocks_ = refs->refs_;
+ refs->tail_ = &refs->refs_;
+ refs->last_block_ = NULL;
+ refs->refs_ = NULL;
+}
+
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ VP8LClearBackwardRefs(refs);
+ while (refs->free_blocks_ != NULL) {
+ PixOrCopyBlock* const next = refs->free_blocks_->next_;
+ WebPSafeFree(refs->free_blocks_);
+ refs->free_blocks_ = next;
+ }
+}
+
+// Swaps the content of two VP8LBackwardRefs.
+static void BackwardRefsSwap(VP8LBackwardRefs* const refs1,
+ VP8LBackwardRefs* const refs2) {
+ const int point_to_refs1 =
+ (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_);
+ const int point_to_refs2 =
+ (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_);
+ const VP8LBackwardRefs tmp = *refs1;
+ *refs1 = *refs2;
+ *refs2 = tmp;
+ if (point_to_refs2) refs1->tail_ = &refs1->refs_;
+ if (point_to_refs1) refs2->tail_ = &refs2->refs_;
+}
+
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
+ assert(refs != NULL);
+ memset(refs, 0, sizeof(*refs));
+ refs->tail_ = &refs->refs_;
+ refs->block_size_ =
+ (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c;
+ c.cur_block_ = refs->refs_;
+ if (refs->refs_ != NULL) {
+ c.cur_pos = c.cur_block_->start_;
+ c.last_pos_ = c.cur_pos + c.cur_block_->size_;
+ } else {
+ c.cur_pos = NULL;
+ c.last_pos_ = NULL;
+ }
+ return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+ PixOrCopyBlock* const b = c->cur_block_->next_;
+ c->cur_pos = (b == NULL) ? NULL : b->start_;
+ c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
+ c->cur_block_ = b;
+}
+
+// Create a new block, either from the free list or freshly allocated.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+ PixOrCopyBlock* b = refs->free_blocks_;
+ if (b == NULL) { // allocate new memory chunk
+ const size_t total_size =
+ sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
+ b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+ if (b == NULL) {
+ refs->error_ |= 1;
+ return NULL;
+ }
+ b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b)); // not always aligned
+ } else { // recycle from free-list
+ refs->free_blocks_ = b->next_;
+ }
+ *refs->tail_ = b;
+ refs->tail_ = &b->next_;
+ refs->last_block_ = b;
+ b->next_ = NULL;
+ b->size_ = 0;
+ return b;
+}
+
+// Return 1 on success, 0 on error.
+static int BackwardRefsClone(const VP8LBackwardRefs* const from,
+ VP8LBackwardRefs* const to) {
+ const PixOrCopyBlock* block_from = from->refs_;
+ VP8LClearBackwardRefs(to);
+ while (block_from != NULL) {
+ PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to);
+ if (block_to == NULL) return 0;
+ memcpy(block_to->start_, block_from->start_,
+ block_from->size_ * sizeof(PixOrCopy));
+ block_to->size_ = block_from->size_;
+ block_from = block_from->next_;
+ }
+ return 1;
+}
+
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v);
+void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v) {
+ PixOrCopyBlock* b = refs->last_block_;
+ if (b == NULL || b->size_ == refs->block_size_) {
+ b = BackwardRefsNewBlock(refs);
+ if (b == NULL) return; // refs->error_ is set
+ }
+ b->start_[b->size_++] = v;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+ assert(p->size_ == 0);
+ assert(p->offset_length_ == NULL);
+ assert(size > 0);
+ p->offset_length_ =
+ (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
+ if (p->offset_length_ == NULL) return 0;
+ p->size_ = size;
+
+ return 1;
+}
+
+void VP8LHashChainClear(VP8LHashChain* const p) {
+ assert(p != NULL);
+ WebPSafeFree(p->offset_length_);
+
+ p->size_ = 0;
+ p->offset_length_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+static const uint32_t kHashMultiplierHi = 0xc6a4a793u;
+static const uint32_t kHashMultiplierLo = 0x5bd1e996u;
+
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t GetPixPairHash64(const uint32_t* const argb) {
+ uint32_t key;
+ key = argb[1] * kHashMultiplierHi;
+ key += argb[0] * kHashMultiplierLo;
+ key = key >> (32 - HASH_BITS);
+ return key;
+}
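+
+// The two multipliers appear to be MurmurHash-style mixing constants; only
+// the top HASH_BITS bits of the mixed 32-bit key are kept.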
+
+// Returns the maximum number of hash chain lookups to do for a
+// given compression quality. Return value in range [8, 86].
+static int GetMaxItersForQuality(int quality) {
+ return 8 + (quality * quality) / 128;
+}
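+
+// E.g. quality 0 gives 8 lookups and quality 100 gives 8 + 10000 / 128 == 86,
+// matching the [8, 86] range documented above.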
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+ const int max_window_size = (quality > 75) ? WINDOW_SIZE
+ : (quality > 50) ? (xsize << 8)
+ : (quality > 25) ? (xsize << 6)
+ : (xsize << 4);
+ assert(xsize > 0);
+ return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
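+
+// For example, a 1000-pixel-wide image at quality 60 gets a window of
+// 1000 << 8 == 256000 pixels (further capped at WINDOW_SIZE); lower
+// qualities shrink the window to speed up the search.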
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+ return (len < MAX_LENGTH) ? len : MAX_LENGTH;
+}
+
+int VP8LHashChainFill(VP8LHashChain* const p, int quality,
+ const uint32_t* const argb, int xsize, int ysize,
+ int low_effort) {
+ const int size = xsize * ysize;
+ const int iter_max = GetMaxItersForQuality(quality);
+ const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+ int pos;
+ int argb_comp;
+ uint32_t base_position;
+ int32_t* hash_to_first_index;
+ // Temporarily use the p->offset_length_ as a hash chain.
+ int32_t* chain = (int32_t*)p->offset_length_;
+ assert(size > 0);
+ assert(p->size_ != 0);
+ assert(p->offset_length_ != NULL);
+
+ if (size <= 2) {
+ p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+ return 1;
+ }
+
+ hash_to_first_index =
+ (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
+ if (hash_to_first_index == NULL) return 0;
+
+ // Set the int32_t array to -1.
+ memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
+ // Fill the chain linking pixels with the same hash.
+ argb_comp = (argb[0] == argb[1]);
+ for (pos = 0; pos < size - 2;) {
+ uint32_t hash_code;
+ const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+ if (argb_comp && argb_comp_next) {
+ // Consecutive pixels with the same color will share the same hash.
+ // We therefore use a different hash: the color and its repetition
+ // length.
+ uint32_t tmp[2];
+ uint32_t len = 1;
+ tmp[0] = argb[pos];
+      // Figure out how long the run of identical pixels is.
+      // The last pixel of the run has a different 64-bit hash, as its next
+      // pixel does not have the same color, so we just need to reach the
+      // last pixel that is equal to its follower.
+ while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+ ++len;
+ }
+ if (len > MAX_LENGTH) {
+ // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+ // because they are linked to their predecessor and we automatically
+ // check that in the main for loop below. Skipping means setting no
+ // predecessor in the chain, hence -1.
+ memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+ pos += len - MAX_LENGTH;
+ len = MAX_LENGTH;
+ }
+ // Process the rest of the hash chain.
+ while (len) {
+ tmp[1] = len--;
+ hash_code = GetPixPairHash64(tmp);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ }
+ argb_comp = 0;
+ } else {
+ // Just move one pixel forward.
+ hash_code = GetPixPairHash64(argb + pos);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ argb_comp = argb_comp_next;
+ }
+ }
+ // Process the penultimate pixel.
+ chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
+ WebPSafeFree(hash_to_first_index);
+
+ // Find the best match interval at each pixel, defined by an offset to the
+ // pixel and a length. The right-most pixel cannot match anything to the right
+ // (hence a best length of 0) and the left-most pixel nothing to the left
+ // (hence an offset of 0).
+ assert(size > 2);
+ p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+ for (base_position = size - 2; base_position > 0;) {
+ const int max_len = MaxFindCopyLength(size - 1 - base_position);
+ const uint32_t* const argb_start = argb + base_position;
+ int iter = iter_max;
+ int best_length = 0;
+ uint32_t best_distance = 0;
+ uint32_t best_argb;
+ const int min_pos =
+ (base_position > window_size) ? base_position - window_size : 0;
+ const int length_max = (max_len < 256) ? max_len : 256;
+ uint32_t max_base_position;
+
+ pos = chain[base_position];
+ if (!low_effort) {
+ int curr_length;
+ // Heuristic: use the comparison with the above line as an initialization.
+ if (base_position >= (uint32_t)xsize) {
+ curr_length = FindMatchLength(argb_start - xsize, argb_start,
+ best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = xsize;
+ }
+ --iter;
+ }
+ // Heuristic: compare to the previous pixel.
+ curr_length =
+ FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = 1;
+ }
+ --iter;
+ // Skip the for loop if we already have the maximum.
+ if (best_length == MAX_LENGTH) pos = min_pos - 1;
+ }
+ best_argb = argb_start[best_length];
+
+ for (; pos >= min_pos && --iter; pos = chain[pos]) {
+ int curr_length;
+ assert(base_position > (uint32_t)pos);
+
+ if (argb[pos + best_length] != best_argb) continue;
+
+ curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
+ if (best_length < curr_length) {
+ best_length = curr_length;
+ best_distance = base_position - pos;
+ best_argb = argb_start[best_length];
+ // Stop if we have reached a good enough length.
+ if (best_length >= length_max) break;
+ }
+ }
+ // We have the best match at base_position; if the two matching intervals
+ // keep matching to the left, the same match, extended, is also the best
+ // one for the pixels to the left.
+ max_base_position = base_position;
+ while (1) {
+ assert(best_length <= MAX_LENGTH);
+ assert(best_distance <= WINDOW_SIZE);
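+ // Store the match packed as (distance << MAX_LENGTH_BITS) | length, so a
+ // single uint32_t holds both values.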
+ p->offset_length_[base_position] =
+ (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
+ --base_position;
+ // Stop if we don't have a match or if we are out of bounds.
+ if (best_distance == 0 || base_position == 0) break;
+ // Stop if we cannot extend the matching intervals to the left.
+ if (base_position < best_distance ||
+ argb[base_position - best_distance] != argb[base_position]) {
+ break;
+ }
+ // Stop if the match has reached its maximum length: a closer matching
+ // interval with the same maximum length could exist. However, if the
+ // matching interval is already as close as possible (best_distance == 1),
+ // we will never find anything better, so keep going.
+ if (best_length == MAX_LENGTH && best_distance != 1 &&
+ base_position + MAX_LENGTH < max_base_position) {
+ break;
+ }
+ if (best_length < MAX_LENGTH) {
+ ++best_length;
+ max_base_position = base_position;
+ }
+ }
+ }
+ return 1;
+}
+
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+ VP8LColorCache* const hashers,
+ VP8LBackwardRefs* const refs) {
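+ // On a cache hit the pixel is coded as a cache index; on a miss it is
+ // coded as a literal and inserted into the cache.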
+ PixOrCopy v;
+ if (use_color_cache) {
+ const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+ if (VP8LColorCacheLookup(hashers, key) == pixel) {
+ v = PixOrCopyCreateCacheIdx(key);
+ } else {
+ v = PixOrCopyCreateLiteral(pixel);
+ VP8LColorCacheSet(hashers, key, pixel);
+ }
+ } else {
+ v = PixOrCopyCreateLiteral(pixel);
+ }
+ VP8LBackwardRefsCursorAdd(refs, v);
+}
+
+static int BackwardReferencesRle(int xsize, int ysize,
+ const uint32_t* const argb,
+ int cache_bits, VP8LBackwardRefs* const refs) {
+ const int pix_count = xsize * ysize;
+ int i, k;
+ const int use_color_cache = (cache_bits > 0);
+ VP8LColorCache hashers;
+
+ if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+ return 0;
+ }
+ VP8LClearBackwardRefs(refs);
+ // Add first pixel as literal.
+ AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+ i = 1;
+ while (i < pix_count) {
+ const int max_len = MaxFindCopyLength(pix_count - i);
+ const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
+ const int prev_row_len = (i < xsize) ? 0 :
+ FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
+ if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+ // We don't need to update the color cache here since it is always the
+ // same pixel being copied, and that does not change the color cache
+ // state.
+ i += rle_len;
+ } else if (prev_row_len >= MIN_LENGTH) {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+ if (use_color_cache) {
+ for (k = 0; k < prev_row_len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[i + k]);
+ }
+ }
+ i += prev_row_len;
+ } else {
+ AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+ i++;
+ }
+ }
+ if (use_color_cache) VP8LColorCacheClear(&hashers);
+ return !refs->error_;
+}
+
+static int BackwardReferencesLz77(int xsize, int ysize,
+ const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs) {
+ int i;
+ int i_last_check = -1;
+ int ok = 0;
+ int cc_init = 0;
+ const int use_color_cache = (cache_bits > 0);
+ const int pix_count = xsize * ysize;
+ VP8LColorCache hashers;
+
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+ VP8LClearBackwardRefs(refs);
+ for (i = 0; i < pix_count;) {
+ // Code the pixels starting at 'i' using a backward reference.
+ int offset = 0;
+ int len = 0;
+ int j;
+ VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+ if (len >= MIN_LENGTH) {
+ const int len_ini = len;
+ int max_reach = 0;
+ const int j_max =
+ (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
+ // Only start from what we have not checked already.
+ i_last_check = (i > i_last_check) ? i : i_last_check;
+ // We know the best match for the current pixel but we try to find the
+ // best matches for the current pixel AND the next one combined.
+ // The naive method would use the intervals:
+ // [i,i+len) + [i+len, length of best match at i+len)
+ // while we check if we can use:
+ // [i,j) (where j<=i+len) + [j, length of best match at j)
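+ // 'reach' is the first pixel not covered when the current match is cut
+ // at j and followed by the best match (or a single literal) starting at j.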
+ for (j = i_last_check + 1; j <= j_max; ++j) {
+ const int len_j = VP8LHashChainFindLength(hash_chain, j);
+ const int reach =
+ j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal.
+ if (reach > max_reach) {
+ len = j - i;
+ max_reach = reach;
+ if (max_reach >= pix_count) break;
+ }
+ }
+ } else {
+ len = 1;
+ }
+ // Go with literal or backward reference.
+ assert(len > 0);
+ if (len == 1) {
+ AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+ } else {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+ if (use_color_cache) {
+ for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
+ }
+ }
+ i += len;
+ }
+
+ ok = !refs->error_;
+ Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ return ok;
+}
+
+// Compute an LZ77 by forcing matches to happen within a given distance cost.
+// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
+// definition.
+#define WINDOW_OFFSETS_SIZE_MAX 32
+static int BackwardReferencesLz77Box(int xsize, int ysize,
+ const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain_best,
+ VP8LHashChain* hash_chain,
+ VP8LBackwardRefs* const refs) {
+ int i;
+ const int pix_count = xsize * ysize;
+ uint16_t* counts;
+ int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
+ int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
+ int window_offsets_size = 0;
+ int window_offsets_new_size = 0;
+ uint16_t* const counts_ini =
+ (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
+ int best_offset_prev = -1, best_length_prev = -1;
+ if (counts_ini == NULL) return 0;
+
+ // counts[i] counts how many times a pixel is repeated starting at position i.
+ i = pix_count - 2;
+ counts = counts_ini + i;
+ counts[1] = 1;
+ for (; i >= 0; --i, --counts) {
+ if (argb[i] == argb[i + 1]) {
+ // Max out the counts to MAX_LENGTH.
+ counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
+ } else {
+ counts[0] = 1;
+ }
+ }
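+ // e.g. for the pixels {A, A, A, B}, counts_ini = {3, 2, 1, 1}, with each
+ // entry capped at MAX_LENGTH.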
+
+ // Figure out the window offsets around a pixel. They are stored in a
+ // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
+ {
+ int x, y;
+ for (y = 0; y <= 6; ++y) {
+ for (x = -6; x <= 6; ++x) {
+ const int offset = y * xsize + x;
+ int plane_code;
+ // Ignore offsets that do not point strictly before the pixel.
+ if (offset <= 0) continue;
+ plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
+ if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
+ window_offsets[plane_code] = offset;
+ }
+ }
+ // For narrow images, not all plane codes are reached, so remove those.
+ for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
+ if (window_offsets[i] == 0) continue;
+ window_offsets[window_offsets_size++] = window_offsets[i];
+ }
+ // Given a pixel P, find the offsets that reach pixels unreachable from P-1
+ // with any of the offsets in window_offsets[].
+ for (i = 0; i < window_offsets_size; ++i) {
+ int j;
+ int is_reachable = 0;
+ for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
+ is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
+ }
+ if (!is_reachable) {
+ window_offsets_new[window_offsets_new_size] = window_offsets[i];
+ ++window_offsets_new_size;
+ }
+ }
+ }
+
+ hash_chain->offset_length_[0] = 0;
+ for (i = 1; i < pix_count; ++i) {
+ int ind;
+ int best_length = VP8LHashChainFindLength(hash_chain_best, i);
+ int best_offset;
+ int do_compute = 1;
+
+ if (best_length >= MAX_LENGTH) {
+ // Do not recompute the best match if we already have a maximal one in the
+ // window.
+ best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
+ for (ind = 0; ind < window_offsets_size; ++ind) {
+ if (best_offset == window_offsets[ind]) {
+ do_compute = 0;
+ break;
+ }
+ }
+ }
+ if (do_compute) {
+ // Figure out if we should use the offset/length from the previous pixel
+ // as an initial guess and therefore only inspect the offsets in
+ // window_offsets_new[].
+ const int use_prev =
+ (best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
+ const int num_ind =
+ use_prev ? window_offsets_new_size : window_offsets_size;
+ best_length = use_prev ? best_length_prev - 1 : 0;
+ best_offset = use_prev ? best_offset_prev : 0;
+ // Find the longest match in a window around the pixel.
+ for (ind = 0; ind < num_ind; ++ind) {
+ int curr_length = 0;
+ int j = i;
+ int j_offset =
+ use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
+ if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
+ // The longest match is the sum of how many times each pixel is
+ // repeated.
+ do {
+ const int counts_j_offset = counts_ini[j_offset];
+ const int counts_j = counts_ini[j];
+ if (counts_j_offset != counts_j) {
+ curr_length +=
+ (counts_j_offset < counts_j) ? counts_j_offset : counts_j;
+ break;
+ }
+ // The same color is repeated counts_j_offset times at j_offset and j.
+ curr_length += counts_j_offset;
+ j_offset += counts_j_offset;
+ j += counts_j_offset;
+ } while (curr_length <= MAX_LENGTH && j < pix_count &&
+ argb[j_offset] == argb[j]);
+ if (best_length < curr_length) {
+ best_offset =
+ use_prev ? window_offsets_new[ind] : window_offsets[ind];
+ if (curr_length >= MAX_LENGTH) {
+ best_length = MAX_LENGTH;
+ break;
+ } else {
+ best_length = curr_length;
+ }
+ }
+ }
+ }
+
+ assert(i + best_length <= pix_count);
+ assert(best_length <= MAX_LENGTH);
+ if (best_length <= MIN_LENGTH) {
+ hash_chain->offset_length_[i] = 0;
+ best_offset_prev = 0;
+ best_length_prev = 0;
+ } else {
+ hash_chain->offset_length_[i] =
+ (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
+ best_offset_prev = best_offset;
+ best_length_prev = best_length;
+ }
+ }
+ hash_chain->offset_length_[0] = 0;
+ WebPSafeFree(counts_ini);
+
+ return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
+ refs);
+}
+
+// -----------------------------------------------------------------------------
+
+static void BackwardReferences2DLocality(int xsize,
+ const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ if (PixOrCopyIsCopy(c.cur_pos)) {
+ const int dist = c.cur_pos->argb_or_distance;
+ const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist);
+ c.cur_pos->argb_or_distance = transformed_dist;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled for low qualities (<= 25).
+// Returns 0 in case of memory error.
+static int CalculateBestCacheSize(const uint32_t* argb, int quality,
+ const VP8LBackwardRefs* const refs,
+ int* const best_cache_bits) {
+ int i;
+ const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
+ double entropy_min = MAX_ENTROPY;
+ int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+ VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+ int ok = 0;
+
+ assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
+
+ if (cache_bits_max == 0) {
+ *best_cache_bits = 0;
+ // Local color cache is disabled.
+ return 1;
+ }
+
+ // Allocate data.
+ for (i = 0; i <= cache_bits_max; ++i) {
+ histos[i] = VP8LAllocateHistogram(i);
+ if (histos[i] == NULL) goto Error;
+ VP8LHistogramInit(histos[i], i, /*init_arrays=*/ 1);
+ if (i == 0) continue;
+ cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+ if (!cc_init[i]) goto Error;
+ }
+
+ // Find the cache_bits giving the lowest entropy. The search is done in a
+ // brute-force way as the function (entropy w.r.t cache_bits) can be
+ // anything in practice.
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ if (PixOrCopyIsLiteral(v)) {
+ const uint32_t pix = *argb++;
+ const uint32_t a = (pix >> 24) & 0xff;
+ const uint32_t r = (pix >> 16) & 0xff;
+ const uint32_t g = (pix >> 8) & 0xff;
+ const uint32_t b = (pix >> 0) & 0xff;
+ // The keys of the caches can be derived from the longest one.
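+ // The key for an i-bit cache is the (i+1)-bit key shifted right by one
+ // bit (see the loop below).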
+ int key = VP8LHashPix(pix, 32 - cache_bits_max);
+ // Do not use the color cache for cache_bits = 0.
+ ++histos[0]->blue_[b];
+ ++histos[0]->literal_[g];
+ ++histos[0]->red_[r];
+ ++histos[0]->alpha_[a];
+ // Deal with cache_bits > 0.
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+ } else {
+ VP8LColorCacheSet(&hashers[i], key, pix);
+ ++histos[i]->blue_[b];
+ ++histos[i]->literal_[g];
+ ++histos[i]->red_[r];
+ ++histos[i]->alpha_[a];
+ }
+ }
+ } else {
+ int code, extra_bits, extra_bits_value;
+ // We should compute the contribution of the (distance, length)
+ // histograms, but those are the same regardless of the cache size.
+ // As those constant contributions are in the end added to the other
+ // histogram contributions, we can ignore them, except for the length
+ // prefix that is part of the literal_ histogram.
+ int len = PixOrCopyLength(v);
+ uint32_t argb_prev = *argb ^ 0xffffffffu;
+ VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value);
+ for (i = 0; i <= cache_bits_max; ++i) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + code];
+ }
+ // Update the color caches.
+ do {
+ if (*argb != argb_prev) {
+ // Efficiency: insert only if the color changes.
+ int key = VP8LHashPix(*argb, 32 - cache_bits_max);
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ hashers[i].colors_[key] = *argb;
+ }
+ argb_prev = *argb;
+ }
+ argb++;
+ } while (--len != 0);
+ }
+ VP8LRefsCursorNext(&c);
+ }
+
+ for (i = 0; i <= cache_bits_max; ++i) {
+ const double entropy = VP8LHistogramEstimateBits(histos[i]);
+ if (i == 0 || entropy < entropy_min) {
+ entropy_min = entropy;
+ *best_cache_bits = i;
+ }
+ }
+ ok = 1;
+Error:
+ for (i = 0; i <= cache_bits_max; ++i) {
+ if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+ VP8LFreeHistogram(histos[i]);
+ }
+ return ok;
+}
+
+// Update (in-place) backward references for specified cache_bits.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+ int cache_bits,
+ VP8LBackwardRefs* const refs) {
+ int pixel_index = 0;
+ VP8LColorCache hashers;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+ while (VP8LRefsCursorOk(&c)) {
+ PixOrCopy* const v = c.cur_pos;
+ if (PixOrCopyIsLiteral(v)) {
+ const uint32_t argb_literal = v->argb_or_distance;
+ const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+ if (ix >= 0) {
+ // hashers contains argb_literal
+ *v = PixOrCopyCreateCacheIdx(ix);
+ } else {
+ VP8LColorCacheInsert(&hashers, argb_literal);
+ }
+ ++pixel_index;
+ } else {
+ // refs was created without a local cache, so it cannot have cache indexes.
+ int k;
+ assert(PixOrCopyIsCopy(v));
+ for (k = 0; k < v->len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+ }
+ }
+ VP8LRefsCursorNext(&c);
+ }
+ VP8LColorCacheClear(&hashers);
+ return 1;
+}
+
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+ int width, int height, const uint32_t* const argb,
+ int* const cache_bits, const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs_lz77) {
+ *cache_bits = 0;
+ if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
+ return NULL;
+ }
+ BackwardReferences2DLocality(width, refs_lz77);
+ return refs_lz77;
+}
+
+extern int VP8LBackwardReferencesTraceBackwards(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+static int GetBackwardReferences(int width, int height,
+ const uint32_t* const argb, int quality,
+ int lz77_types_to_try, int cache_bits_max,
+ int do_no_cache,
+ const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs,
+ int* const cache_bits_best) {
+ VP8LHistogram* histo = NULL;
+ int i, lz77_type;
+ // Index 0 is for a color cache, index 1 for no cache (if needed).
+ int lz77_types_best[2] = {0, 0};
+ double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
+ VP8LHashChain hash_chain_box;
+ VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
+ int status = 0;
+ memset(&hash_chain_box, 0, sizeof(hash_chain_box));
+
+ histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
+ if (histo == NULL) goto Error;
+
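+ // Iterate over the set bits of lz77_types_to_try, clearing each bit once
+ // the corresponding LZ77 variant has been evaluated.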
+ for (lz77_type = 1; lz77_types_to_try;
+ lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
+ int res = 0;
+ double bit_cost = 0.;
+ if ((lz77_types_to_try & lz77_type) == 0) continue;
+ switch (lz77_type) {
+ case kLZ77RLE:
+ res = BackwardReferencesRle(width, height, argb, 0, refs_tmp);
+ break;
+ case kLZ77Standard:
+ // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
+ // cache is not that different in practice.
+ res = BackwardReferencesLz77(width, height, argb, 0, hash_chain,
+ refs_tmp);
+ break;
+ case kLZ77Box:
+ if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
+ res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
+ &hash_chain_box, refs_tmp);
+ break;
+ default:
+ assert(0);
+ }
+ if (!res) goto Error;
+
+ // Start with the no color cache case.
+ for (i = 1; i >= 0; --i) {
+ int cache_bits = (i == 1) ? 0 : cache_bits_max;
+
+ if (i == 1 && !do_no_cache) continue;
+
+ if (i == 0) {
+ // Try with a color cache.
+ if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) {
+ goto Error;
+ }
+ if (cache_bits > 0) {
+ if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) {
+ goto Error;
+ }
+ }
+ }
+
+ if (i == 0 && do_no_cache && cache_bits == 0) {
+ // No need to re-compute bit_cost as it was computed at i == 1.
+ } else {
+ VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+ bit_cost = VP8LHistogramEstimateBits(histo);
+ }
+
+ if (bit_cost < bit_costs_best[i]) {
+ if (i == 1) {
+ // Do not swap as the full cache analysis would have the wrong
+ // VP8LBackwardRefs to start with.
+ if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error;
+ } else {
+ BackwardRefsSwap(refs_tmp, &refs[0]);
+ }
+ bit_costs_best[i] = bit_cost;
+ lz77_types_best[i] = lz77_type;
+ if (i == 0) *cache_bits_best = cache_bits;
+ }
+ }
+ }
+ assert(lz77_types_best[0] > 0);
+ assert(!do_no_cache || lz77_types_best[1] > 0);
+
+ // Improve on simple LZ77 but only for high quality (TraceBackwards is
+ // costly).
+ for (i = 1; i >= 0; --i) {
+ if (i == 1 && !do_no_cache) continue;
+ if ((lz77_types_best[i] == kLZ77Standard ||
+ lz77_types_best[i] == kLZ77Box) &&
+ quality >= 25) {
+ const VP8LHashChain* const hash_chain_tmp =
+ (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
+ const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
+ if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+ hash_chain_tmp, &refs[i],
+ refs_tmp)) {
+ double bit_cost_trace;
+ VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+ bit_cost_trace = VP8LHistogramEstimateBits(histo);
+ if (bit_cost_trace < bit_costs_best[i]) {
+ BackwardRefsSwap(refs_tmp, &refs[i]);
+ }
+ }
+ }
+
+ BackwardReferences2DLocality(width, &refs[i]);
+
+ if (i == 1 && lz77_types_best[0] == lz77_types_best[1] &&
+ *cache_bits_best == 0) {
+ // If the best cache size is 0 and we have the same best LZ77, just copy
+ // the data over and stop here.
+ if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error;
+ break;
+ }
+ }
+ status = 1;
+
+Error:
+ VP8LHashChainClear(&hash_chain_box);
+ VP8LFreeHistogram(histo);
+ return status;
+}
+
+WebPEncodingError VP8LGetBackwardReferences(
+ int width, int height, const uint32_t* const argb, int quality,
+ int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+ int* const cache_bits_best) {
+ if (low_effort) {
+ VP8LBackwardRefs* refs_best;
+ *cache_bits_best = cache_bits_max;
+ refs_best = GetBackwardReferencesLowEffort(
+ width, height, argb, cache_bits_best, hash_chain, refs);
+ if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ // Store it in the first position.
+ BackwardRefsSwap(refs_best, &refs[0]);
+ } else {
+ if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+ cache_bits_max, do_no_cache, hash_chain, refs,
+ cache_bits_best)) {
+ return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ }
+ }
+ return VP8_ENC_OK;
+}
diff --git a/media/libwebp/enc/backward_references_enc.h b/media/libwebp/enc/backward_references_enc.h
index 539e991cfc..292c630e5e 100644
--- a/media/libwebp/enc/backward_references_enc.h
+++ b/media/libwebp/enc/backward_references_enc.h
@@ -16,6 +16,7 @@
#include <assert.h>
#include <stdlib.h>
#include "../webp/types.h"
+#include "../webp/encode.h"
#include "../webp/format_constants.h"
#ifdef __cplusplus
@@ -218,14 +219,19 @@ enum VP8LLZ77Type {
// Evaluates best possible backward references for specified quality.
// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
// bits to use (passing 0 implies disabling the local color cache).
-// The optimal cache bits is evaluated and set for the *cache_bits parameter.
-// The return value is the pointer to the best of the two backward refs viz,
-// refs[0] or refs[1].
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+// The optimal number of cache bits is evaluated and set in the
+// *cache_bits_best parameter, together with the matching refs_best.
+// If do_no_cache == 0, refs is an array of 2 values and the best
+// VP8LBackwardRefs is put in the first element.
+// If do_no_cache != 0, refs is an array of 3 values and the best
+// VP8LBackwardRefs is put in the first element, the best value with no-cache in
+// the second element.
+// In both cases, the last element is used internally as a temporary.
+WebPEncodingError VP8LGetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
- int low_effort, int lz77_types_to_try, int* const cache_bits,
- const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
- VP8LBackwardRefs* const refs_tmp2);
+ int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+ int* const cache_bits_best);
#ifdef __cplusplus
}
diff --git a/media/libwebp/enc/config_enc.c b/media/libwebp/enc/config_enc.c
new file mode 100644
index 0000000000..97df32d0d4
--- /dev/null
+++ b/media/libwebp/enc/config_enc.c
@@ -0,0 +1,157 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding tools configuration
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/encode.h"
+
+//------------------------------------------------------------------------------
+// WebPConfig
+//------------------------------------------------------------------------------
+
+int WebPConfigInitInternal(WebPConfig* config,
+ WebPPreset preset, float quality, int version) {
+ if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+ return 0; // caller/system version mismatch!
+ }
+ if (config == NULL) return 0;
+
+ config->quality = quality;
+ config->target_size = 0;
+ config->target_PSNR = 0.;
+ config->method = 4;
+ config->sns_strength = 50;
+ config->filter_strength = 60; // mid-filtering
+ config->filter_sharpness = 0;
+ config->filter_type = 1; // default: strong (so U/V is filtered too)
+ config->partitions = 0;
+ config->segments = 4;
+ config->pass = 1;
+ config->qmin = 0;
+ config->qmax = 100;
+ config->show_compressed = 0;
+ config->preprocessing = 0;
+ config->autofilter = 0;
+ config->partition_limit = 0;
+ config->alpha_compression = 1;
+ config->alpha_filtering = 1;
+ config->alpha_quality = 100;
+ config->lossless = 0;
+ config->exact = 0;
+ config->image_hint = WEBP_HINT_DEFAULT;
+ config->emulate_jpeg_size = 0;
+ config->thread_level = 0;
+ config->low_memory = 0;
+ config->near_lossless = 100;
+ config->use_delta_palette = 0;
+ config->use_sharp_yuv = 0;
+
+ // TODO(skal): tune.
+ switch (preset) {
+ case WEBP_PRESET_PICTURE:
+ config->sns_strength = 80;
+ config->filter_sharpness = 4;
+ config->filter_strength = 35;
+ config->preprocessing &= ~2; // no dithering
+ break;
+ case WEBP_PRESET_PHOTO:
+ config->sns_strength = 80;
+ config->filter_sharpness = 3;
+ config->filter_strength = 30;
+ config->preprocessing |= 2;
+ break;
+ case WEBP_PRESET_DRAWING:
+ config->sns_strength = 25;
+ config->filter_sharpness = 6;
+ config->filter_strength = 10;
+ break;
+ case WEBP_PRESET_ICON:
+ config->sns_strength = 0;
+ config->filter_strength = 0; // disable filtering to retain sharpness
+ config->preprocessing &= ~2; // no dithering
+ break;
+ case WEBP_PRESET_TEXT:
+ config->sns_strength = 0;
+ config->filter_strength = 0; // disable filtering to retain sharpness
+ config->preprocessing &= ~2; // no dithering
+ config->segments = 2;
+ break;
+ case WEBP_PRESET_DEFAULT:
+ default:
+ break;
+ }
+ return WebPValidateConfig(config);
+}
+
+int WebPValidateConfig(const WebPConfig* config) {
+ if (config == NULL) return 0;
+ if (config->quality < 0 || config->quality > 100) return 0;
+ if (config->target_size < 0) return 0;
+ if (config->target_PSNR < 0) return 0;
+ if (config->method < 0 || config->method > 6) return 0;
+ if (config->segments < 1 || config->segments > 4) return 0;
+ if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+ if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+ if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+ if (config->filter_type < 0 || config->filter_type > 1) return 0;
+ if (config->autofilter < 0 || config->autofilter > 1) return 0;
+ if (config->pass < 1 || config->pass > 10) return 0;
+ if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
+ return 0;
+ }
+ if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+ if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+ if (config->partitions < 0 || config->partitions > 3) return 0;
+ if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+ if (config->alpha_compression < 0) return 0;
+ if (config->alpha_filtering < 0) return 0;
+ if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+ if (config->lossless < 0 || config->lossless > 1) return 0;
+ if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+ if (config->image_hint >= WEBP_HINT_LAST) return 0;
+ if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+ if (config->thread_level < 0 || config->thread_level > 1) return 0;
+ if (config->low_memory < 0 || config->low_memory > 1) return 0;
+ if (config->exact < 0 || config->exact > 1) return 0;
+ if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
+ return 0;
+ }
+ if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_LEVEL 9
+
+// Mapping between -z level and -m / -q parameter settings.
+static const struct {
+ uint8_t method_;
+ uint8_t quality_;
+} kLosslessPresets[MAX_LEVEL + 1] = {
+ { 0, 0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
+ { 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
+};
+
+int WebPConfigLosslessPreset(WebPConfig* config, int level) {
+ if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
+ config->lossless = 1;
+ config->method = kLosslessPresets[level].method_;
+ config->quality = kLosslessPresets[level].quality_;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/cost_enc.c b/media/libwebp/enc/cost_enc.c
new file mode 100644
index 0000000000..bb7fe64fa2
--- /dev/null
+++ b/media/libwebp/enc/cost_enc.c
@@ -0,0 +1,342 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Cost tables for level and modes
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// For each given level, the following table gives the pattern of contexts to
+// use for coding it (in [][0]) as well as the bit value to use for each
+// context (in [][1]).
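+// e.g. level 2 uses {0x007, 0x001}: probas #2, #3 and #4 are used, coding
+// bit values 1, 0 and 0 respectively.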
+const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
+ {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
+ {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
+ {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
+ {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013},
+ {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
+};
+
+static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
+ int pattern = VP8LevelCodes[level - 1][0];
+ int bits = VP8LevelCodes[level - 1][1];
+ int cost = 0;
+ int i;
+ for (i = 2; pattern; ++i) {
+ if (pattern & 1) {
+ cost += VP8BitCost(bits & 1, probas[i]);
+ }
+ bits >>= 1;
+ pattern >>= 1;
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Pre-calc level costs once for all
+
+void VP8CalculateLevelCosts(VP8EncProba* const proba) {
+ int ctype, band, ctx;
+
+ if (!proba->dirty_) return; // nothing to do.
+
+ for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
+ int n;
+ for (band = 0; band < NUM_BANDS; ++band) {
+ for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+ const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
+ uint16_t* const table = proba->level_cost_[ctype][band][ctx];
+ const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+ const int cost_base = VP8BitCost(1, p[1]) + cost0;
+ int v;
+ table[0] = VP8BitCost(0, p[1]) + cost0;
+ for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
+ table[v] = cost_base + VariableLevelCost(v, p);
+ }
+ // From level 67 upward, the variable part of the cost is actually
+ // constant.
+ }
+ }
+ for (n = 0; n < 16; ++n) { // replicate bands; no sentinel is needed.
+ for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+ proba->remapped_costs_[ctype][n][ctx] =
+ proba->level_cost_[ctype][VP8EncBands[n]][ctx];
+ }
+ }
+ }
+ proba->dirty_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Mode cost tables.
+
+// These are the fixed probabilities (in the coding trees) turned into bit-cost
+// by calling VP8BitCost().
+const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
+// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
+const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
+const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
+ { { 40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
+ { 192, 469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
+ { 142, 910, 762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
+ { 559, 641, 1370, 421, 1182, 1569, 1612, 1725, 863, 1007 },
+ { 299, 1059, 1256, 1108, 636, 1068, 1581, 1883, 869, 1142 },
+ { 277, 1111, 707, 1362, 1089, 672, 1603, 1541, 1545, 1291 },
+ { 214, 781, 1609, 1303, 1632, 2229, 726, 1560, 1713, 918 },
+ { 152, 1037, 1046, 1759, 1983, 2174, 1358, 742, 1740, 1390 },
+ { 512, 1046, 1420, 753, 752, 1297, 1486, 1613, 460, 1207 },
+ { 424, 827, 1362, 719, 1462, 1202, 1199, 1476, 1199, 538 } },
+ { { 240, 402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
+ { 467, 242, 960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
+ { 500, 455, 463, 1507, 1699, 1282, 1564, 982, 2114, 2114 },
+ { 672, 643, 1372, 331, 1589, 1667, 1453, 1938, 996, 876 },
+ { 458, 783, 1037, 911, 738, 968, 1165, 1518, 859, 1033 },
+ { 504, 815, 504, 1139, 1219, 719, 1506, 1085, 1268, 1268 },
+ { 333, 630, 1445, 1239, 1883, 3672, 799, 1548, 1865, 598 },
+ { 399, 644, 746, 1342, 1856, 1350, 1493, 613, 1855, 1015 },
+ { 622, 749, 1205, 608, 1066, 1408, 1290, 1406, 546, 971 },
+ { 500, 753, 1041, 668, 1230, 1617, 1297, 1425, 1383, 523 } },
+ { { 394, 553, 523, 1502, 1536, 981, 1608, 1142, 1666, 2181 },
+ { 655, 430, 375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
+ { 690, 640, 245, 1954, 2070, 1194, 1528, 982, 1972, 2232 },
+ { 559, 834, 741, 867, 1131, 980, 1225, 852, 1092, 784 },
+ { 690, 875, 516, 959, 673, 894, 1056, 1190, 1528, 1126 },
+ { 740, 951, 384, 1277, 1177, 492, 1579, 1155, 1846, 1513 },
+ { 323, 775, 1062, 1776, 3062, 1274, 813, 1188, 1372, 655 },
+ { 488, 971, 484, 1767, 1515, 1775, 1115, 503, 1539, 1461 },
+ { 740, 1006, 998, 709, 851, 1230, 1337, 788, 741, 721 },
+ { 522, 1073, 573, 1045, 1346, 887, 1046, 1146, 1203, 697 } },
+ { { 105, 864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
+ { 534, 305, 1193, 683, 1388, 2164, 1802, 1894, 1264, 1170 },
+ { 305, 518, 877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
+ { 683, 732, 1927, 257, 1493, 2048, 1858, 1552, 1055, 947 },
+ { 394, 814, 1024, 660, 959, 1556, 1282, 1289, 893, 1047 },
+ { 528, 615, 996, 940, 1201, 635, 1094, 2515, 803, 1358 },
+ { 347, 614, 1609, 1187, 3133, 1345, 1007, 1339, 1017, 667 },
+ { 218, 740, 878, 1605, 3650, 3650, 1345, 758, 1357, 1617 },
+ { 672, 750, 1541, 558, 1257, 1599, 1870, 2135, 402, 1087 },
+ { 592, 684, 1161, 430, 1092, 1497, 1475, 1489, 1095, 822 } },
+ { { 228, 1056, 1059, 1368, 752, 982, 1512, 1518, 987, 1782 },
+ { 494, 514, 818, 942, 965, 892, 1610, 1356, 1048, 1363 },
+ { 512, 648, 591, 1042, 761, 991, 1196, 1454, 1309, 1463 },
+ { 683, 749, 1043, 676, 841, 1396, 1133, 1138, 654, 939 },
+ { 622, 1101, 1126, 994, 361, 1077, 1203, 1318, 877, 1219 },
+ { 631, 1068, 857, 1650, 651, 477, 1650, 1419, 828, 1170 },
+ { 555, 727, 1068, 1335, 3127, 1339, 820, 1331, 1077, 429 },
+ { 504, 879, 624, 1398, 889, 889, 1392, 808, 891, 1406 },
+ { 683, 1602, 1289, 977, 578, 983, 1280, 1708, 406, 1122 },
+ { 399, 865, 1433, 1070, 1072, 764, 968, 1477, 1223, 678 } },
+ { { 333, 760, 935, 1638, 1010, 529, 1646, 1410, 1472, 2219 },
+ { 512, 494, 750, 1160, 1215, 610, 1870, 1868, 1628, 1169 },
+ { 572, 646, 492, 1934, 1208, 603, 1580, 1099, 1398, 1995 },
+ { 786, 789, 942, 581, 1018, 951, 1599, 1207, 731, 768 },
+ { 690, 1015, 672, 1078, 582, 504, 1693, 1438, 1108, 2897 },
+ { 768, 1267, 571, 2005, 1243, 244, 2881, 1380, 1786, 1453 },
+ { 452, 899, 1293, 903, 1311, 3100, 465, 1311, 1319, 813 },
+ { 394, 927, 942, 1103, 1358, 1104, 946, 593, 1363, 1109 },
+ { 559, 1005, 1007, 1016, 658, 1173, 1021, 1164, 623, 1028 },
+ { 564, 796, 632, 1005, 1014, 863, 2316, 1268, 938, 764 } },
+ { { 266, 606, 1098, 1228, 1497, 1243, 948, 1030, 1734, 1461 },
+ { 366, 585, 901, 1060, 1407, 1247, 876, 1134, 1620, 1054 },
+ { 452, 565, 542, 1729, 1479, 1479, 1016, 886, 2938, 1150 },
+ { 555, 1088, 1533, 950, 1354, 895, 834, 1019, 1021, 496 },
+ { 704, 815, 1193, 971, 973, 640, 1217, 2214, 832, 578 },
+ { 672, 1245, 579, 871, 875, 774, 872, 1273, 1027, 949 },
+ { 296, 1134, 2050, 1784, 1636, 3425, 442, 1550, 2076, 722 },
+ { 342, 982, 1259, 1846, 1848, 1848, 622, 568, 1847, 1052 },
+ { 555, 1064, 1304, 828, 746, 1343, 1075, 1329, 1078, 494 },
+ { 288, 1167, 1285, 1174, 1639, 1639, 833, 2254, 1304, 509 } },
+ { { 342, 719, 767, 1866, 1757, 1270, 1246, 550, 1746, 2151 },
+ { 483, 653, 694, 1509, 1459, 1410, 1218, 507, 1914, 1266 },
+ { 488, 757, 447, 2979, 1813, 1268, 1654, 539, 1849, 2109 },
+ { 522, 1097, 1085, 851, 1365, 1111, 851, 901, 961, 605 },
+ { 709, 716, 841, 728, 736, 945, 941, 862, 2845, 1057 },
+ { 512, 1323, 500, 1336, 1083, 681, 1342, 717, 1604, 1350 },
+ { 452, 1155, 1372, 1900, 1501, 3290, 311, 944, 1919, 922 },
+ { 403, 1520, 977, 2132, 1733, 3522, 1076, 276, 3335, 1547 },
+ { 559, 1374, 1101, 615, 673, 2462, 974, 795, 984, 984 },
+ { 547, 1122, 1062, 812, 1410, 951, 1140, 622, 1268, 651 } },
+ { { 165, 982, 1235, 938, 1334, 1366, 1659, 1578, 964, 1612 },
+ { 592, 422, 925, 847, 1139, 1112, 1387, 2036, 861, 1041 },
+ { 403, 837, 732, 770, 941, 1658, 1250, 809, 1407, 1407 },
+ { 896, 874, 1071, 381, 1568, 1722, 1437, 2192, 480, 1035 },
+ { 640, 1098, 1012, 1032, 684, 1382, 1581, 2106, 416, 865 },
+ { 559, 1005, 819, 914, 710, 770, 1418, 920, 838, 1435 },
+ { 415, 1258, 1245, 870, 1278, 3067, 770, 1021, 1287, 522 },
+ { 406, 990, 601, 1009, 1265, 1265, 1267, 759, 1017, 1277 },
+ { 968, 1182, 1329, 788, 1032, 1292, 1705, 1714, 203, 1403 },
+ { 732, 877, 1279, 471, 901, 1161, 1545, 1294, 755, 755 } },
+ { { 111, 931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
+ { 406, 414, 1030, 1023, 1910, 1404, 1313, 1647, 1509, 793 },
+ { 342, 640, 575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
+ { 559, 766, 1185, 357, 1682, 1428, 1329, 1897, 1219, 802 },
+ { 473, 909, 1164, 771, 719, 2508, 1427, 1432, 722, 782 },
+ { 342, 892, 785, 1145, 1150, 794, 1296, 1550, 973, 1057 },
+ { 208, 1036, 1326, 1343, 1606, 3395, 815, 1455, 1618, 712 },
+ { 228, 928, 890, 1046, 3499, 1711, 994, 829, 1720, 1318 },
+ { 768, 724, 1058, 636, 991, 1075, 1319, 1324, 616, 825 },
+ { 305, 1167, 1358, 899, 1587, 1587, 987, 1988, 1332, 501 } }
+};
+
+//------------------------------------------------------------------------------
+// helper functions for residuals struct VP8Residual.
+
+void VP8InitResidual(int first, int coeff_type,
+ VP8Encoder* const enc, VP8Residual* const res) {
+ res->coeff_type = coeff_type;
+ res->prob = enc->proba_.coeffs_[coeff_type];
+ res->stats = enc->proba_.stats_[coeff_type];
+ res->costs = enc->proba_.remapped_costs_[coeff_type];
+ res->first = first;
+}
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
+ const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int R = 0;
+ int ctx;
+
+ VP8InitResidual(0, 3, enc, &res);
+ ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(levels, &res);
+ R += VP8GetResidualCost(ctx, &res);
+ return R;
+}
+
+int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ // DC
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
+
+ // AC
+ VP8InitResidual(1, 0, enc, &res);
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
+ }
+ }
+ return R;
+}
+
+int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int ch, x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
+ }
+ }
+ }
+ return R;
+}
+
+
+//------------------------------------------------------------------------------
+// Recording of token probabilities.
+
+// We keep the table-free variant around for reference, just in case.
+#define USE_LEVEL_CODE_TABLE
+
+// Simulate block coding, but only record statistics.
+// Note: no need to record the fixed probas.
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
+ int n = res->first;
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (res->last < 0) {
+ VP8RecordStats(0, s + 0);
+ return 0;
+ }
+ while (n <= res->last) {
+ int v;
+ VP8RecordStats(1, s + 0); // order of record doesn't matter
+ while ((v = res->coeffs[n++]) == 0) {
+ VP8RecordStats(0, s + 1);
+ s = res->stats[VP8EncBands[n]][0];
+ }
+ VP8RecordStats(1, s + 1);
+ if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
+ s = res->stats[VP8EncBands[n]][1];
+ } else {
+ v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
+ if (!VP8RecordStats(v > 4, s + 3)) {
+ if (VP8RecordStats(v != 2, s + 4))
+ VP8RecordStats(v == 4, s + 5);
+ } else if (!VP8RecordStats(v > 10, s + 6)) {
+ VP8RecordStats(v > 6, s + 7);
+ } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+ VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
+ } else {
+ VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
+ }
+#else
+ if (v > MAX_VARIABLE_LEVEL) {
+ v = MAX_VARIABLE_LEVEL;
+ }
+
+ {
+ const int bits = VP8LevelCodes[v - 1][1];
+ int pattern = VP8LevelCodes[v - 1][0];
+ int i;
+ for (i = 0; (pattern >>= 1) != 0; ++i) {
+ const int mask = 2 << i;
+ if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
+ }
+ }
+#endif
+ s = res->stats[VP8EncBands[n]][2];
+ }
+ }
+ if (n < 16) VP8RecordStats(0, s + 0);
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/delta_palettization_enc.h b/media/libwebp/enc/delta_palettization_enc.h
deleted file mode 100644
index 63048ec6e8..0000000000
--- a/media/libwebp/enc/delta_palettization_enc.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_H_
-
-#include "../webp/encode.h"
-#include "../enc/vp8li_enc.h"
-
-// Replaces enc->argb_[] input by a palettizable approximation of it,
-// and generates optimal enc->palette_[].
-// This function can revert enc->use_palette_ / enc->use_predict_ flag
-// if delta-palettization is not producing expected saving.
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
-
-#endif // WEBP_ENC_DELTA_PALETTIZATION_H_
diff --git a/media/libwebp/enc/filter_enc.c b/media/libwebp/enc/filter_enc.c
new file mode 100644
index 0000000000..5ffc232626
--- /dev/null
+++ b/media/libwebp/enc/filter_enc.c
@@ -0,0 +1,235 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Selecting filter level
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include <assert.h>
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+
+// This table gives, for a given sharpness, the filtering strength to be
+// used (at least) in order to filter a given edge step delta.
+// This is constructed by brute force inspection: for all delta, we iterate
+// over all possible filtering strength / thresh until needs_filter() returns
+// true.
+#define MAX_DELTA_SIZE 64
+static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18,
+ 20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
+ 44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 19,
+ 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
+ 44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19,
+ 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
+ 45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20,
+ 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
+ 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20,
+ 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
+ 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 21,
+ 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
+ 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21,
+ 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
+ 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
+};
+
+int VP8FilterStrengthFromDelta(int sharpness, int delta) {
+ const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
+ assert(sharpness >= 0 && sharpness <= 7);
+ return kLevelsFromDelta[sharpness][pos];
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 15.4: compute the inner-edge filtering strength
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static int GetILevel(int sharpness, int level) {
+ if (sharpness > 0) {
+ if (sharpness > 4) {
+ level >>= 2;
+ } else {
+ level >>= 1;
+ }
+ if (level > 9 - sharpness) {
+ level = 9 - sharpness;
+ }
+ }
+ if (level < 1) level = 1;
+ return level;
+}
+
+static void DoFilter(const VP8EncIterator* const it, int level) {
+ const VP8Encoder* const enc = it->enc_;
+ const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
+ const int limit = 2 * level + ilevel;
+
+ uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
+ uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
+ uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
+
+ // copy current block to yuv_out2_
+ memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
+
+ if (enc->filter_hdr_.simple_ == 1) { // simple
+ VP8SimpleHFilter16i(y_dst, BPS, limit);
+ VP8SimpleVFilter16i(y_dst, BPS, limit);
+ } else { // complex
+ const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+ VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SSIM metric for one macroblock
+
+static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
+ int x, y;
+ double sum = 0.;
+
+ // compute SSIM in a 10 x 10 window
+ for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
+ for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
+ sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+ x, y, 16, 16);
+ }
+ }
+ for (x = 1; x < 7; x++) {
+ for (y = 1; y < 7; y++) {
+ sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+ x, y, 8, 8);
+ sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+ x, y, 8, 8);
+ }
+ }
+ return sum;
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// Exposed APIs: the encoder should call the following three functions to
+// adjust the loop filter strength.
+
+void VP8InitFilter(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+ if (it->lf_stats_ != NULL) {
+ int s, i;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ for (i = 0; i < MAX_LF_LEVELS; i++) {
+ (*it->lf_stats_)[s][i] = 0;
+ }
+ }
+ VP8SSIMDspInit();
+ }
+#else
+ (void)it;
+#endif
+}
+
+void VP8StoreFilterStats(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+ int d;
+ VP8Encoder* const enc = it->enc_;
+ const int s = it->mb_->segment_;
+ const int level0 = enc->dqm_[s].fstrength_;
+
+ // explore +/-quant range of values around level0
+ const int delta_min = -enc->dqm_[s].quant_;
+ const int delta_max = enc->dqm_[s].quant_;
+ const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
+
+ if (it->lf_stats_ == NULL) return;
+
+ // NOTE: currently we apply the filter only across the sub-block edges,
+ // for two reasons:
+ // 1. Filtering across macroblock edges would change pixels in the left
+ // and top macroblocks, which would be hard to restore.
+ // 2. Macroblocks to the bottom and right are not yet compressed, so we
+ // cannot filter across the right and bottom macroblock edges.
+ if (it->mb_->type_ == 1 && it->mb_->skip_) return;
+
+ // Always try filter level zero
+ (*it->lf_stats_)[s][0] += GetMBSSIM(it->yuv_in_, it->yuv_out_);
+
+ for (d = delta_min; d <= delta_max; d += step_size) {
+ const int level = level0 + d;
+ if (level <= 0 || level >= MAX_LF_LEVELS) {
+ continue;
+ }
+ DoFilter(it, level);
+ (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
+ }
+#else // defined(WEBP_REDUCE_SIZE)
+ (void)it;
+#endif // !defined(WEBP_REDUCE_SIZE)
+}
+
+void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+#if !defined(WEBP_REDUCE_SIZE)
+ if (it->lf_stats_ != NULL) {
+ int s;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ int i, best_level = 0;
+ // Improvement over filter level 0 should be at least 1e-5 (relatively)
+ double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+ for (i = 1; i < MAX_LF_LEVELS; i++) {
+ const double v = (*it->lf_stats_)[s][i];
+ if (v > best_v) {
+ best_v = v;
+ best_level = i;
+ }
+ }
+ enc->dqm_[s].fstrength_ = best_level;
+ }
+ return;
+ }
+#endif // !defined(WEBP_REDUCE_SIZE)
+ if (enc->config_->filter_strength > 0) {
+ int max_level = 0;
+ int s;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ VP8SegmentInfo* const dqm = &enc->dqm_[s];
+ // this '>> 3' accounts for some inverse WHT scaling
+ const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
+ const int level =
+ VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
+ if (level > dqm->fstrength_) {
+ dqm->fstrength_ = level;
+ }
+ if (max_level < dqm->fstrength_) {
+ max_level = dqm->fstrength_;
+ }
+ }
+ enc->filter_hdr_.level_ = max_level;
+ }
+}
+
+// -----------------------------------------------------------------------------
diff --git a/media/libwebp/enc/frame_enc.c b/media/libwebp/enc/frame_enc.c
new file mode 100644
index 0000000000..c8698ca5b5
--- /dev/null
+++ b/media/libwebp/enc/frame_enc.c
@@ -0,0 +1,899 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// frame coding and analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../webp/format_constants.h" // RIFF constants
+
+#define SEGMENT_VISU 0
+#define DEBUG_SEARCH 0 // useful to track search convergence
+
+//------------------------------------------------------------------------------
+// multi-pass convergence
+
+#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + \
+ VP8_FRAME_HEADER_SIZE)
+#define DQ_LIMIT 0.4 // convergence is considered reached if dq < DQ_LIMIT
+// We allow 2k of extra headroom in the PARTITION0 limit.
+#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
+
+static float Clamp(float v, float min, float max) {
+ return (v < min) ? min : (v > max) ? max : v;
+}
+
+typedef struct { // struct for organizing convergence in either size or PSNR
+ int is_first;
+ float dq;
+ float q, last_q;
+ float qmin, qmax;
+ double value, last_value; // PSNR or size
+ double target;
+ int do_size_search;
+} PassStats;
+
+static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
+ const uint64_t target_size = (uint64_t)enc->config_->target_size;
+ const int do_size_search = (target_size != 0);
+ const float target_PSNR = enc->config_->target_PSNR;
+
+ s->is_first = 1;
+ s->dq = 10.f;
+ s->qmin = 1.f * enc->config_->qmin;
+ s->qmax = 1.f * enc->config_->qmax;
+ s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
+ s->target = do_size_search ? (double)target_size
+ : (target_PSNR > 0.) ? target_PSNR
+ : 40.; // default, just in case
+ s->value = s->last_value = 0.;
+ s->do_size_search = do_size_search;
+ return do_size_search;
+}
+
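+// Secant-style update: estimate the next q from the last two (q, value)
+// pairs so that 'value' converges towards 'target'.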
+static float ComputeNextQ(PassStats* const s) {
+ float dq;
+ if (s->is_first) {
+ dq = (s->value > s->target) ? -s->dq : s->dq;
+ s->is_first = 0;
+ } else if (s->value != s->last_value) {
+ const double slope = (s->target - s->value) / (s->last_value - s->value);
+ dq = (float)(slope * (s->last_q - s->q));
+ } else {
+ dq = 0.; // we're done?!
+ }
+ // Limit variable to avoid large swings.
+ s->dq = Clamp(dq, -30.f, 30.f);
+ s->last_q = s->q;
+ s->last_value = s->value;
+ s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
+ return s->q;
+}
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
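+// Fixed probabilities for coding the extra bits of large coefficient levels
+// (token categories 3..6 of the VP8 coefficient tree).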
+const uint8_t VP8Cat3[] = { 173, 148, 140 };
+const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
+const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
+const uint8_t VP8Cat6[] =
+ { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
+
+//------------------------------------------------------------------------------
+// Reset the statistics (number of skips, token probabilities, level costs, ...)
+
+static void ResetStats(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ VP8CalculateLevelCosts(proba);
+ proba->nb_skip_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Skip decision probability
+
+#define SKIP_PROBA_THRESHOLD 250 // value below which using skip_proba is OK.
+
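+// Returns the probability, in [0, 255], that a macroblock is not skipped,
+// given 'nb' skips observed out of 'total' macroblocks.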
+static int CalcSkipProba(uint64_t nb, uint64_t total) {
+ return (int)(total ? (total - nb) * 255 / total : 255);
+}
+
+// Returns the bit-cost for coding the skip probability.
+static int FinalizeSkipProba(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ const int nb_mbs = enc->mb_w_ * enc->mb_h_;
+ const int nb_events = proba->nb_skip_;
+ int size;
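+  // Note: the costs below are expressed in 1/256-bit units (256 is one bit,
+  // 8 * 256 one byte).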
+ proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
+ proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
+ size = 256; // 'use_skip_proba' bit
+ if (proba->use_skip_proba_) {
+ size += nb_events * VP8BitCost(1, proba->skip_proba_)
+ + (nb_mbs - nb_events) * VP8BitCost(0, proba->skip_proba_);
+ size += 8 * 256; // cost of signaling the skip_proba_ itself.
+ }
+ return size;
+}
+
+// Returns the probability of a '0' bit given 'nb' ones observed out of
+// 'total' events, scaled to [0, 255].
+static int CalcTokenProba(int nb, int total) {
+ assert(nb <= total);
+ return nb ? (255 - nb * 255 / total) : 255;
+}
+
+// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
+static int BranchCost(int nb, int total, int proba) {
+ return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
+}
+
+static void ResetTokenStats(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ memset(proba->stats_, 0, sizeof(proba->stats_));
+}
+
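+// Collect statistics and deduce probabilities for the next coding pass.
+// For each token probability, the cost of keeping the default value is
+// compared against the cost of transmitting a new 8-bit probability (plus
+// the per-proba update flag), and the cheaper option is kept.
+// Returns the total bit-cost for coding the probability updates.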
+static int FinalizeTokenProbas(VP8EncProba* const proba) {
+ int has_changed = 0;
+ int size = 0;
+ int t, b, c, p;
+ for (t = 0; t < NUM_TYPES; ++t) {
+ for (b = 0; b < NUM_BANDS; ++b) {
+ for (c = 0; c < NUM_CTX; ++c) {
+ for (p = 0; p < NUM_PROBAS; ++p) {
+ const proba_t stats = proba->stats_[t][b][c][p];
+ const int nb = (stats >> 0) & 0xffff;
+ const int total = (stats >> 16) & 0xffff;
+ const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
+ const int old_p = VP8CoeffsProba0[t][b][c][p];
+ const int new_p = CalcTokenProba(nb, total);
+ const int old_cost = BranchCost(nb, total, old_p)
+ + VP8BitCost(0, update_proba);
+ const int new_cost = BranchCost(nb, total, new_p)
+ + VP8BitCost(1, update_proba)
+ + 8 * 256;
+ const int use_new_p = (old_cost > new_cost);
+ size += VP8BitCost(use_new_p, update_proba);
+ if (use_new_p) { // only use proba that seem meaningful enough.
+ proba->coeffs_[t][b][c][p] = new_p;
+ has_changed |= (new_p != old_p);
+ size += 8 * 256;
+ } else {
+ proba->coeffs_[t][b][c][p] = old_p;
+ }
+ }
+ }
+ }
+ }
+ proba->dirty_ = has_changed;
+ return size;
+}
+
+//------------------------------------------------------------------------------
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {
+ const int total = a + b;
+ return (total == 0) ? 255 // that's the default probability.
+ : (255 * a + total / 2) / total; // rounded proba
+}
+
+static void ResetSegments(VP8Encoder* const enc) {
+ int n;
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ enc->mb_info_[n].segment_ = 0;
+ }
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+ int p[NUM_MB_SEGMENTS] = { 0 };
+ int n;
+
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ const VP8MBInfo* const mb = &enc->mb_info_[n];
+ ++p[mb->segment_];
+ }
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats != NULL) {
+ for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+ enc->pic_->stats->segment_size[n] = p[n];
+ }
+ }
+#endif
+ if (enc->segment_hdr_.num_segments_ > 1) {
+ uint8_t* const probas = enc->proba_.segments_;
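+    // The segment id is coded as a 2-level binary tree: probas[0] splits
+    // segments {0,1} from {2,3}, probas[1] splits 0 from 1 and probas[2]
+    // splits 2 from 3.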
+ probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
+ probas[1] = GetProba(p[0], p[1]);
+ probas[2] = GetProba(p[2], p[3]);
+
+ enc->segment_hdr_.update_map_ =
+ (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+ if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
+ enc->segment_hdr_.size_ =
+ p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+ p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+ p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+ p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+ } else {
+ enc->segment_hdr_.update_map_ = 0;
+ enc->segment_hdr_.size_ = 0;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Coefficient coding
+
+static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const uint8_t* p = res->prob[n][ctx];
+ if (!VP8PutBit(bw, res->last >= 0, p[0])) {
+ return 0;
+ }
+
+ while (n < 16) {
+ const int c = res->coeffs[n++];
+ const int sign = c < 0;
+ int v = sign ? -c : c;
+ if (!VP8PutBit(bw, v != 0, p[1])) {
+ p = res->prob[VP8EncBands[n]][0];
+ continue;
+ }
+ if (!VP8PutBit(bw, v > 1, p[2])) {
+ p = res->prob[VP8EncBands[n]][1];
+ } else {
+ if (!VP8PutBit(bw, v > 4, p[3])) {
+ if (VP8PutBit(bw, v != 2, p[4])) {
+ VP8PutBit(bw, v == 4, p[5]);
+ }
+ } else if (!VP8PutBit(bw, v > 10, p[6])) {
+ if (!VP8PutBit(bw, v > 6, p[7])) {
+ VP8PutBit(bw, v == 6, 159);
+ } else {
+ VP8PutBit(bw, v >= 9, 165);
+ VP8PutBit(bw, !(v & 1), 145);
+ }
+ } else {
+ int mask;
+ const uint8_t* tab;
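+      // Large values are coded as one of four categories with a fixed base
+      // (11, 19, 35 or 67) plus 3, 4, 5 or 11 extra bits respectively.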
+ if (v < 3 + (8 << 1)) { // VP8Cat3 (3b)
+ VP8PutBit(bw, 0, p[8]);
+ VP8PutBit(bw, 0, p[9]);
+ v -= 3 + (8 << 0);
+ mask = 1 << 2;
+ tab = VP8Cat3;
+ } else if (v < 3 + (8 << 2)) { // VP8Cat4 (4b)
+ VP8PutBit(bw, 0, p[8]);
+ VP8PutBit(bw, 1, p[9]);
+ v -= 3 + (8 << 1);
+ mask = 1 << 3;
+ tab = VP8Cat4;
+ } else if (v < 3 + (8 << 3)) { // VP8Cat5 (5b)
+ VP8PutBit(bw, 1, p[8]);
+ VP8PutBit(bw, 0, p[10]);
+ v -= 3 + (8 << 2);
+ mask = 1 << 4;
+ tab = VP8Cat5;
+ } else { // VP8Cat6 (11b)
+ VP8PutBit(bw, 1, p[8]);
+ VP8PutBit(bw, 1, p[10]);
+ v -= 3 + (8 << 3);
+ mask = 1 << 10;
+ tab = VP8Cat6;
+ }
+ while (mask) {
+ VP8PutBit(bw, !!(v & mask), *tab++);
+ mask >>= 1;
+ }
+ }
+ p = res->prob[VP8EncBands[n]][2];
+ }
+ VP8PutBitUniform(bw, sign);
+ if (n == 16 || !VP8PutBit(bw, n <= res->last, p[0])) {
+ return 1; // EOB
+ }
+ }
+ return 1;
+}
+
+static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int x, y, ch;
+ VP8Residual res;
+ uint64_t pos1, pos2, pos3;
+ const int i16 = (it->mb_->type_ == 1);
+ const int segment = it->mb_->segment_;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+
+ pos1 = VP8BitWriterPos(bw);
+ if (i16) {
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
+ }
+ }
+ pos2 = VP8BitWriterPos(bw);
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ PutCoeffs(bw, ctx, &res);
+ }
+ }
+ }
+ pos3 = VP8BitWriterPos(bw);
+ it->luma_bits_ = pos2 - pos1;
+ it->uv_bits_ = pos3 - pos2;
+ it->bit_count_[segment][i16] += it->luma_bits_;
+ it->bit_count_[segment][2] += it->uv_bits_;
+ VP8IteratorBytesToNz(it);
+}
+
+// Same as CodeResiduals, but doesn't actually write anything.
+// Instead, it just records the event distribution.
+static void RecordResiduals(VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int x, y, ch;
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+
+ if (it->mb_->type_ == 1) { // i16x16
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
+ }
+ }
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ VP8RecordCoeffs(ctx, &res);
+ }
+ }
+ }
+
+ VP8IteratorBytesToNz(it);
+}
+
+//------------------------------------------------------------------------------
+// Token buffer
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+ VP8TBuffer* const tokens) {
+ int x, y, ch;
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+ if (it->mb_->type_ == 1) { // i16x16
+ const int ctx = it->top_nz_[8] + it->left_nz_[8];
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ }
+ }
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ }
+ }
+ }
+ VP8IteratorBytesToNz(it);
+ return !tokens->error_;
+}
+
+#endif // !DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
+// ExtraInfo map / Debug function
+
+#if !defined(WEBP_DISABLE_STATS)
+
+#if SEGMENT_VISU
+static void SetBlock(uint8_t* p, int value, int size) {
+ int y;
+ for (y = 0; y < size; ++y) {
+ memset(p, value, size);
+ p += BPS;
+ }
+}
+#endif
+
+static void ResetSSE(VP8Encoder* const enc) {
+ enc->sse_[0] = 0;
+ enc->sse_[1] = 0;
+ enc->sse_[2] = 0;
+ // Note: enc->sse_[3] is managed by alpha.c
+ enc->sse_count_ = 0;
+}
+
+static void StoreSSE(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const uint8_t* const in = it->yuv_in_;
+ const uint8_t* const out = it->yuv_out_;
+  // Note: not totally accurate at the boundary, and doesn't include the
+  // in-loop filter.
+ enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
+ enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
+ enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
+ enc->sse_count_ += 16 * 16;
+}
+
+static void StoreSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const VP8MBInfo* const mb = it->mb_;
+ WebPPicture* const pic = enc->pic_;
+
+ if (pic->stats != NULL) {
+ StoreSSE(it);
+ enc->block_count_[0] += (mb->type_ == 0);
+ enc->block_count_[1] += (mb->type_ == 1);
+ enc->block_count_[2] += (mb->skip_ != 0);
+ }
+
+ if (pic->extra_info != NULL) {
+ uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
+ switch (pic->extra_info_type) {
+ case 1: *info = mb->type_; break;
+ case 2: *info = mb->segment_; break;
+ case 3: *info = enc->dqm_[mb->segment_].quant_; break;
+ case 4: *info = (mb->type_ == 1) ? it->preds_[0] : 0xff; break;
+ case 5: *info = mb->uv_mode_; break;
+ case 6: {
+ const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
+ *info = (b > 255) ? 255 : b; break;
+ }
+ case 7: *info = mb->alpha_; break;
+ default: *info = 0; break;
+ }
+ }
+#if SEGMENT_VISU // visualize segments and prediction modes
+ SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
+ SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
+ SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
+#endif
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ WebPPicture* const pic = enc->pic_;
+ if (pic->stats != NULL) {
+ memset(enc->block_count_, 0, sizeof(enc->block_count_));
+ }
+ ResetSSE(enc);
+}
+#else // defined(WEBP_DISABLE_STATS)
+static void ResetSSE(VP8Encoder* const enc) {
+ (void)enc;
+}
+static void StoreSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ WebPPicture* const pic = enc->pic_;
+ if (pic->extra_info != NULL) {
+ if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start
+ memset(pic->extra_info, 0,
+ enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+ }
+ }
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+ (void)it;
+}
+#endif // !defined(WEBP_DISABLE_STATS)
+
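+// PSNR = 10 * log10(255^2 / MSE), with MSE = mse / size.
+// Returns 99dB when either input is zero.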
+static double GetPSNR(uint64_t mse, uint64_t size) {
+ return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
+}
+
+//------------------------------------------------------------------------------
+// StatLoop(): only collects statistics (number of skips, token usage, ...).
+// These are used for deciding optimal probabilities. It also modifies the
+// quantizer value if a target (size, PSNR) was specified.
+
+static void SetLoopParams(VP8Encoder* const enc, float q) {
+ // Make sure the quality parameter is inside valid bounds
+ q = Clamp(q, 0.f, 100.f);
+
+ VP8SetSegmentParams(enc, q); // setup segment quantizations and filters
+ SetSegmentProbas(enc); // compute segment probabilities
+
+ ResetStats(enc);
+ ResetSSE(enc);
+}
+
+static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
+ int nb_mbs, int percent_delta,
+ PassStats* const s) {
+ VP8EncIterator it;
+ uint64_t size = 0;
+ uint64_t size_p0 = 0;
+ uint64_t distortion = 0;
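+  // 384 = 16 * 16 luma + 2 * 8 * 8 chroma samples per macroblock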
+ const uint64_t pixel_count = nb_mbs * 384;
+
+ VP8IteratorInit(enc, &it);
+ SetLoopParams(enc, s->q);
+ do {
+ VP8ModeScore info;
+ VP8IteratorImport(&it, NULL);
+ if (VP8Decimate(&it, &info, rd_opt)) {
+ // Just record the number of skips and act like skip_proba is not used.
+ ++enc->proba_.nb_skip_;
+ }
+ RecordResiduals(&it, &info);
+ size += info.R + info.H;
+ size_p0 += info.H;
+ distortion += info.D;
+ if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
+ return 0;
+ }
+ VP8IteratorSaveBoundary(&it);
+ } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+
+ size_p0 += enc->segment_hdr_.size_;
+ if (s->do_size_search) {
+ size += FinalizeSkipProba(enc);
+ size += FinalizeTokenProbas(&enc->proba_);
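+    // convert from 1/256-bit units to bytes, rounding to nearest
+    // ('>> 11' is a division by 2048 = 8 * 256)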
+ size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
+ s->value = (double)size;
+ } else {
+ s->value = GetPSNR(distortion, pixel_count);
+ }
+ return size_p0;
+}
+
+static int StatLoop(VP8Encoder* const enc) {
+ const int method = enc->method_;
+ const int do_search = enc->do_search_;
+ const int fast_probe = ((method == 0 || method == 3) && !do_search);
+ int num_pass_left = enc->config_->pass;
+ const int task_percent = 20;
+ const int percent_per_pass =
+ (task_percent + num_pass_left / 2) / num_pass_left;
+ const int final_percent = enc->percent_ + task_percent;
+ const VP8RDLevel rd_opt =
+ (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
+ int nb_mbs = enc->mb_w_ * enc->mb_h_;
+ PassStats stats;
+
+ InitPassStats(enc, &stats);
+ ResetTokenStats(enc);
+
+  // Fast mode: quick analysis pass over a few mbs. Better than nothing.
+ if (fast_probe) {
+ if (method == 3) { // we need more stats for method 3 to be reliable.
+ nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
+ } else {
+ nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+ }
+ }
+
+ while (num_pass_left-- > 0) {
+ const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+ (num_pass_left == 0) ||
+ (enc->max_i4_header_bits_ == 0);
+ const uint64_t size_p0 =
+ OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
+ if (size_p0 == 0) return 0;
+#if (DEBUG_SEARCH > 0)
+ printf("#%d value:%.1lf -> %.1lf q:%.2f -> %.2f\n",
+ num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
+#endif
+ if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+ ++num_pass_left;
+ enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
+ continue; // ...and start over
+ }
+ if (is_last_pass) {
+ break;
+ }
+    // If no target size: just do several passes without changing 'q'
+ if (do_search) {
+ ComputeNextQ(&stats);
+ if (fabs(stats.dq) <= DQ_LIMIT) break;
+ }
+ }
+ if (!do_search || !stats.do_size_search) {
+ // Need to finalize probas now, since it wasn't done during the search.
+ FinalizeSkipProba(enc);
+ FinalizeTokenProbas(&enc->proba_);
+ }
+ VP8CalculateLevelCosts(&enc->proba_); // finalize costs
+ return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
+}
+
+//------------------------------------------------------------------------------
+// Main loops
+//
+
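+// Rough average of compressed bytes per macroblock, indexed by the coarse
+// quantizer value (base_quant_ >> 4). Used to pre-size the partition
+// bit-writers.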
+static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+
+static int PreLoopInitialize(VP8Encoder* const enc) {
+ int p;
+ int ok = 1;
+ const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
+ const int bytes_per_parts =
+ enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
+ // Initialize the bit-writers
+ for (p = 0; ok && p < enc->num_parts_; ++p) {
+ ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+ }
+ if (!ok) {
+ VP8EncFreeBitWriters(enc); // malloc error occurred
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ return ok;
+}
+
+static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
+ VP8Encoder* const enc = it->enc_;
+ if (ok) { // Finalize the partitions, check for extra errors.
+ int p;
+ for (p = 0; p < enc->num_parts_; ++p) {
+ VP8BitWriterFinish(enc->parts_ + p);
+ ok &= !enc->parts_[p].error_;
+ }
+ }
+
+ if (ok) { // All good. Finish up.
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats != NULL) { // finalize byte counters...
+ int i, s;
+ for (i = 0; i <= 2; ++i) {
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+ }
+ }
+ }
+#endif
+ VP8AdjustFilterStrength(it); // ...and store filter stats.
+ } else {
+ // Something bad happened -> need to do some memory cleanup.
+ VP8EncFreeBitWriters(enc);
+ }
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+// VP8EncLoop(): does the final bitstream coding.
+
+static void ResetAfterSkip(VP8EncIterator* const it) {
+ if (it->mb_->type_ == 1) {
+ *it->nz_ = 0; // reset all predictors
+ it->left_nz_[8] = 0;
+ } else {
+ *it->nz_ &= (1 << 24); // preserve the dc_nz bit
+ }
+}
+
+int VP8EncLoop(VP8Encoder* const enc) {
+ VP8EncIterator it;
+ int ok = PreLoopInitialize(enc);
+ if (!ok) return 0;
+
+ StatLoop(enc); // stats-collection loop
+
+ VP8IteratorInit(enc, &it);
+ VP8InitFilter(&it);
+ do {
+ VP8ModeScore info;
+ const int dont_use_skip = !enc->proba_.use_skip_proba_;
+ const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+ VP8IteratorImport(&it, NULL);
+ // Warning! order is important: first call VP8Decimate() and
+ // *then* decide how to code the skip decision if there's one.
+ if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+ CodeResiduals(it.bw_, &it, &info);
+ } else { // reset predictors after a skip
+ ResetAfterSkip(&it);
+ }
+ StoreSideInfo(&it);
+ VP8StoreFilterStats(&it);
+ VP8IteratorExport(&it);
+ ok = VP8IteratorProgress(&it, 20);
+ VP8IteratorSaveBoundary(&it);
+ } while (ok && VP8IteratorNext(&it));
+
+ return PostLoopFinalize(&it, ok);
+}
+
+//------------------------------------------------------------------------------
+// Single pass using Token Buffer.
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+#define MIN_COUNT 96 // minimum number of macroblocks before updating stats
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+ // Roughly refresh the proba eight times per pass
+ int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
+ int num_pass_left = enc->config_->pass;
+ int remaining_progress = 40; // percents
+ const int do_search = enc->do_search_;
+ VP8EncIterator it;
+ VP8EncProba* const proba = &enc->proba_;
+ const VP8RDLevel rd_opt = enc->rd_opt_level_;
+ const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+ PassStats stats;
+ int ok;
+
+ InitPassStats(enc, &stats);
+ ok = PreLoopInitialize(enc);
+ if (!ok) return 0;
+
+ if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+
+ assert(enc->num_parts_ == 1);
+ assert(enc->use_tokens_);
+ assert(proba->use_skip_proba_ == 0);
+ assert(rd_opt >= RD_OPT_BASIC); // otherwise, token-buffer won't be useful
+ assert(num_pass_left > 0);
+
+ while (ok && num_pass_left-- > 0) {
+ const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+ (num_pass_left == 0) ||
+ (enc->max_i4_header_bits_ == 0);
+ uint64_t size_p0 = 0;
+ uint64_t distortion = 0;
+ int cnt = max_count;
+ // The final number of passes is not trivial to know in advance.
+ const int pass_progress = remaining_progress / (2 + num_pass_left);
+ remaining_progress -= pass_progress;
+ VP8IteratorInit(enc, &it);
+ SetLoopParams(enc, stats.q);
+ if (is_last_pass) {
+ ResetTokenStats(enc);
+ VP8InitFilter(&it); // don't collect stats until last pass (too costly)
+ }
+ VP8TBufferClear(&enc->tokens_);
+ do {
+ VP8ModeScore info;
+ VP8IteratorImport(&it, NULL);
+ if (--cnt < 0) {
+ FinalizeTokenProbas(proba);
+ VP8CalculateLevelCosts(proba); // refresh cost tables for rd-opt
+ cnt = max_count;
+ }
+ VP8Decimate(&it, &info, rd_opt);
+ ok = RecordTokens(&it, &info, &enc->tokens_);
+ if (!ok) {
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ break;
+ }
+ size_p0 += info.H;
+ distortion += info.D;
+ if (is_last_pass) {
+ StoreSideInfo(&it);
+ VP8StoreFilterStats(&it);
+ VP8IteratorExport(&it);
+ ok = VP8IteratorProgress(&it, pass_progress);
+ }
+ VP8IteratorSaveBoundary(&it);
+ } while (ok && VP8IteratorNext(&it));
+ if (!ok) break;
+
+ size_p0 += enc->segment_hdr_.size_;
+ if (stats.do_size_search) {
+ uint64_t size = FinalizeTokenProbas(&enc->proba_);
+ size += VP8EstimateTokenSize(&enc->tokens_,
+ (const uint8_t*)proba->coeffs_);
+ size = (size + size_p0 + 1024) >> 11; // -> size in bytes
+ size += HEADER_SIZE_ESTIMATE;
+ stats.value = (double)size;
+ } else { // compute and store PSNR
+ stats.value = GetPSNR(distortion, pixel_count);
+ }
+
+#if (DEBUG_SEARCH > 0)
+ printf("#%2d metric:%.1lf -> %.1lf last_q=%.2lf q=%.2lf dq=%.2lf "
+ " range:[%.1f, %.1f]\n",
+ num_pass_left, stats.last_value, stats.value,
+ stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
+#endif
+ if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+ ++num_pass_left;
+ enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
+ if (is_last_pass) {
+ ResetSideInfo(&it);
+ }
+ continue; // ...and start over
+ }
+ if (is_last_pass) {
+ break; // done
+ }
+ if (do_search) {
+ ComputeNextQ(&stats); // Adjust q
+ }
+ }
+ if (ok) {
+ if (!stats.do_size_search) {
+ FinalizeTokenProbas(&enc->proba_);
+ }
+ ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
+ (const uint8_t*)proba->coeffs_, 1);
+ }
+ ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+ &enc->percent_);
+ return PostLoopFinalize(&it, ok);
+}
+
+#else
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+ (void)enc;
+ return 0; // we shouldn't be here.
+}
+
+#endif // DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/histogram_enc.c b/media/libwebp/enc/histogram_enc.c
new file mode 100644
index 0000000000..83f218bcb4
--- /dev/null
+++ b/media/libwebp/enc/histogram_enc.c
@@ -0,0 +1,1252 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include <math.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+
+#define MAX_COST 1.e38
+
+// Number of partitions for the three dominant (literal, red and blue) symbol
+// costs.
+#define NUM_PARTITIONS 4
+// The size of the bin-hash corresponding to the three dominant costs.
+#define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+// Maximum number of histograms allowed in greedy combining algorithm.
+#define MAX_HISTO_GREEDY 100
+
+static void HistogramClear(VP8LHistogram* const p) {
+ uint32_t* const literal = p->literal_;
+ const int cache_bits = p->palette_code_bits_;
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ memset(p, 0, histo_size);
+ p->palette_code_bits_ = cache_bits;
+ p->literal_ = literal;
+}
+
+// Swap two histogram pointers.
+static void HistogramSwap(VP8LHistogram** const A, VP8LHistogram** const B) {
+ VP8LHistogram* const tmp = *A;
+ *A = *B;
+ *B = tmp;
+}
+
+static void HistogramCopy(const VP8LHistogram* const src,
+ VP8LHistogram* const dst) {
+ uint32_t* const dst_literal = dst->literal_;
+ const int dst_cache_bits = dst->palette_code_bits_;
+ const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
+ const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+ assert(src->palette_code_bits_ == dst_cache_bits);
+ memcpy(dst, src, histo_size);
+ dst->literal_ = dst_literal;
+ memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
+}
+
+int VP8LGetHistogramSize(int cache_bits) {
+ const int literal_size = VP8LHistogramNumCodes(cache_bits);
+ const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+ assert(total_size <= (size_t)0x7fffffff);
+ return (int)total_size;
+}
+
+void VP8LFreeHistogram(VP8LHistogram* const histo) {
+ WebPSafeFree(histo);
+}
+
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo) {
+ WebPSafeFree(histo);
+}
+
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+ VP8LHistogram* const histo) {
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+void VP8LHistogramCreate(VP8LHistogram* const p,
+ const VP8LBackwardRefs* const refs,
+ int palette_code_bits) {
+ if (palette_code_bits >= 0) {
+ p->palette_code_bits_ = palette_code_bits;
+ }
+ HistogramClear(p);
+ VP8LHistogramStoreRefs(refs, p);
+}
+
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
+ int init_arrays) {
+ p->palette_code_bits_ = palette_code_bits;
+ if (init_arrays) {
+ HistogramClear(p);
+ } else {
+ p->trivial_symbol_ = 0;
+ p->bit_cost_ = 0.;
+ p->literal_cost_ = 0.;
+ p->red_cost_ = 0.;
+ p->blue_cost_ = 0.;
+ memset(p->is_used_, 0, sizeof(p->is_used_));
+ }
+}
+
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
+ VP8LHistogram* histo = NULL;
+ const int total_size = VP8LGetHistogramSize(cache_bits);
+ uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+ if (memory == NULL) return NULL;
+ histo = (VP8LHistogram*)memory;
+  // literal_ won't necessarily be aligned.
+ histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+ VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 0);
+ return histo;
+}
+
+// Resets the pointers of the histograms to point to the bit buffer in the set.
+static void HistogramSetResetPointers(VP8LHistogramSet* const set,
+ int cache_bits) {
+ int i;
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ uint8_t* memory = (uint8_t*) (set->histograms);
+ memory += set->max_size * sizeof(*set->histograms);
+ for (i = 0; i < set->max_size; ++i) {
+ memory = (uint8_t*) WEBP_ALIGN(memory);
+ set->histograms[i] = (VP8LHistogram*) memory;
+    // literal_ won't necessarily be aligned.
+ set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+ memory += histo_size;
+ }
+}
+
+// Returns the total size of the VP8LHistogramSet.
+static size_t HistogramSetTotalSize(int size, int cache_bits) {
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
+ histo_size + WEBP_ALIGN_CST));
+}
+
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
+ int i;
+ VP8LHistogramSet* set;
+ const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+ uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+ if (memory == NULL) return NULL;
+
+ set = (VP8LHistogramSet*)memory;
+ memory += sizeof(*set);
+ set->histograms = (VP8LHistogram**)memory;
+ set->max_size = size;
+ set->size = size;
+ HistogramSetResetPointers(set, cache_bits);
+ for (i = 0; i < size; ++i) {
+ VP8LHistogramInit(set->histograms[i], cache_bits, /*init_arrays=*/ 0);
+ }
+ return set;
+}
+
+void VP8LHistogramSetClear(VP8LHistogramSet* const set) {
+ int i;
+ const int cache_bits = set->histograms[0]->palette_code_bits_;
+ const int size = set->max_size;
+ const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+ uint8_t* memory = (uint8_t*)set;
+
+ memset(memory, 0, total_size);
+ memory += sizeof(*set);
+ set->histograms = (VP8LHistogram**)memory;
+ set->max_size = size;
+ set->size = size;
+ HistogramSetResetPointers(set, cache_bits);
+ for (i = 0; i < size; ++i) {
+ set->histograms[i]->palette_code_bits_ = cache_bits;
+ }
+}
+
+// Removes the histogram 'i' from 'set' by setting it to NULL.
+static void HistogramSetRemoveHistogram(VP8LHistogramSet* const set, int i,
+ int* const num_used) {
+ assert(set->histograms[i] != NULL);
+ set->histograms[i] = NULL;
+ --*num_used;
+ // If we remove the last valid one, shrink until the next valid one.
+ if (i == set->size - 1) {
+ while (set->size >= 1 && set->histograms[set->size - 1] == NULL) {
+ --set->size;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+ const PixOrCopy* const v,
+ int (*const distance_modifier)(int, int),
+ int distance_modifier_arg0) {
+ if (PixOrCopyIsLiteral(v)) {
+ ++histo->alpha_[PixOrCopyLiteral(v, 3)];
+ ++histo->red_[PixOrCopyLiteral(v, 2)];
+ ++histo->literal_[PixOrCopyLiteral(v, 1)];
+ ++histo->blue_[PixOrCopyLiteral(v, 0)];
+ } else if (PixOrCopyIsCacheIdx(v)) {
+ const int literal_ix =
+ NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+ assert(histo->palette_code_bits_ != 0);
+ ++histo->literal_[literal_ix];
+ } else {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+ ++histo->literal_[NUM_LITERAL_CODES + code];
+ if (distance_modifier == NULL) {
+ VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+ } else {
+ VP8LPrefixEncodeBits(
+ distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+ &code, &extra_bits);
+ }
+ ++histo->distance_[code];
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Entropy-related functions.
+
+static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+ double mix;
+ if (entropy->nonzeros < 5) {
+ if (entropy->nonzeros <= 1) {
+ return 0;
+ }
+ // Two symbols, they will be 0 and 1 in a Huffman code.
+ // Let's mix in a bit of entropy to favor good clustering when
+ // distributions of these are combined.
+ if (entropy->nonzeros == 2) {
+ return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+ }
+ // No matter what the entropy says, we cannot be better than min_limit
+ // with Huffman coding. I am mixing a bit of entropy into the
+ // min_limit since it produces much better (~0.5 %) compression results
+ // perhaps because of better entropy clustering.
+ if (entropy->nonzeros == 3) {
+ mix = 0.95;
+ } else {
+ mix = 0.7; // nonzeros == 4.
+ }
+ } else {
+ mix = 0.627;
+ }
+
+ {
+ double min_limit = 2 * entropy->sum - entropy->max_val;
+ min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+ return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
+ }
+}
+
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
+ VP8LBitEntropy entropy;
+ VP8LBitsEntropyUnrefined(array, n, &entropy);
+
+ return BitsEntropyRefine(&entropy);
+}
+
+static double InitialHuffmanCost(void) {
+ // Small bias because Huffman code length is typically not stored in
+ // full length.
+ static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+ static const double kSmallBias = 9.1;
+ return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+}
+
+// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
+static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+ // The constants in this function are experimental and got rounded from
+ // their original values in 1/8 when switched to 1/1024.
+ double retval = InitialHuffmanCost();
+  // First coefficient: many zeros in the histogram are covered efficiently
+ // by a run-length encode. Originally 2/8.
+ retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+ // Second coefficient: Constant values are encoded less efficiently, but still
+ // RLE'ed. Originally 6/8.
+ retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+ // 0s are usually encoded more efficiently than non-0s.
+ // Originally 15/8.
+ retval += 1.796875 * stats->streaks[0][0];
+ // Originally 26/8.
+ retval += 3.28125 * stats->streaks[1][0];
+ return retval;
+}
+
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym' if there's only one symbol present in the distribution.
+static double PopulationCost(const uint32_t* const population, int length,
+ uint32_t* const trivial_sym,
+ uint8_t* const is_used) {
+ VP8LBitEntropy bit_entropy;
+ VP8LStreaks stats;
+ VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
+ if (trivial_sym != NULL) {
+ *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
+ : VP8L_NON_TRIVIAL_SYM;
+ }
+ // The histogram is used if there is at least one non-zero streak.
+ *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0);
+
+ return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+}
+
+// trivial_at_end is 1 if each of the two histograms has only one non-zero
+// element: either both at index 0, or both at the last index.
+static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
+ const uint32_t* const Y,
+ int length, int is_X_used,
+ int is_Y_used,
+ int trivial_at_end) {
+ VP8LStreaks stats;
+ if (trivial_at_end) {
+ // This configuration is due to palettization that transforms an indexed
+ // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+ // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+ // Only FinalHuffmanCost needs to be evaluated.
+ memset(&stats, 0, sizeof(stats));
+ // Deal with the non-zero value at index 0 or length-1.
+ stats.streaks[1][0] = 1;
+ // Deal with the following/previous zero streak.
+ stats.counts[0] = 1;
+ stats.streaks[0][1] = length - 1;
+ return FinalHuffmanCost(&stats);
+ } else {
+ VP8LBitEntropy bit_entropy;
+ if (is_X_used) {
+ if (is_Y_used) {
+ VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+ } else {
+ VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats);
+ }
+ } else {
+ if (is_Y_used) {
+ VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats);
+ } else {
+ memset(&stats, 0, sizeof(stats));
+ stats.counts[0] = 1;
+ stats.streaks[0][length > 3] = length;
+ VP8LBitEntropyInit(&bit_entropy);
+ }
+ }
+
+ return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+ }
+}
+
+// Estimates the Entropy + Huffman + other block overhead size cost.
+double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+ return
+ PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
+ NULL, &p->is_used_[0])
+ + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
+ + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
+ + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
+ + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
+ + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+ + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+// -----------------------------------------------------------------------------
+// Various histogram combine/cost-eval functions
+
+static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ double cost_threshold,
+ double* cost) {
+ const int palette_code_bits = a->palette_code_bits_;
+ int trivial_at_end = 0;
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ *cost += GetCombinedEntropy(a->literal_, b->literal_,
+ VP8LHistogramNumCodes(palette_code_bits),
+ a->is_used_[0], b->is_used_[0], 0);
+ *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+ b->literal_ + NUM_LITERAL_CODES,
+ NUM_LENGTH_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+ a->trivial_symbol_ == b->trivial_symbol_) {
+ // A, R and B are all 0 or 0xff.
+ const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+ const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+ const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+ if ((color_a == 0 || color_a == 0xff) &&
+ (color_r == 0 || color_r == 0xff) &&
+ (color_b == 0 || color_b == 0xff)) {
+ trivial_at_end = 1;
+ }
+ }
+
+ *cost +=
+ GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
+ b->is_used_[1], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
+ b->is_used_[2], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+ a->is_used_[3], b->is_used_[3], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
+ a->is_used_[4], b->is_used_[4], 0);
+ *cost +=
+ VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ return 1;
+}
+
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ VP8LHistogramAdd(a, b, out);
+ out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+ ? a->trivial_symbol_
+ : VP8L_NON_TRIVIAL_SYM;
+}
+
+// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
+// to the threshold value 'cost_threshold'. The score returned is
+// Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
+// Since the previous score passed is 'cost_threshold', we only need to compare
+// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
+// early.
+static double HistogramAddEval(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out,
+ double cost_threshold) {
+ double cost = 0;
+ const double sum_cost = a->bit_cost_ + b->bit_cost_;
+ cost_threshold += sum_cost;
+
+ if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
+ HistogramAdd(a, b, out);
+ out->bit_cost_ = cost;
+ out->palette_code_bits_ = a->palette_code_bits_;
+ }
+
+ return cost - sum_cost;
+}
+
+// Same as HistogramAddEval(), except that the resulting histogram
+// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
+// the term C(b) which is constant over all the evaluations.
+static double HistogramAddThresh(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ double cost_threshold) {
+ double cost;
+ assert(a != NULL && b != NULL);
+ cost = -a->bit_cost_;
+ GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
+ return cost;
+}
+
+// -----------------------------------------------------------------------------
+
+// The structure to keep track of cost range for the three dominant entropy
+// symbols.
+// TODO(skal): Evaluate if float can be used here instead of double for
+// representing the entropy costs.
+typedef struct {
+ double literal_max_;
+ double literal_min_;
+ double red_max_;
+ double red_min_;
+ double blue_max_;
+ double blue_min_;
+} DominantCostRange;
+
+static void DominantCostRangeInit(DominantCostRange* const c) {
+ c->literal_max_ = 0.;
+ c->literal_min_ = MAX_COST;
+ c->red_max_ = 0.;
+ c->red_min_ = MAX_COST;
+ c->blue_max_ = 0.;
+ c->blue_min_ = MAX_COST;
+}
+
+static void UpdateDominantCostRange(
+ const VP8LHistogram* const h, DominantCostRange* const c) {
+ if (c->literal_max_ < h->literal_cost_) c->literal_max_ = h->literal_cost_;
+ if (c->literal_min_ > h->literal_cost_) c->literal_min_ = h->literal_cost_;
+ if (c->red_max_ < h->red_cost_) c->red_max_ = h->red_cost_;
+ if (c->red_min_ > h->red_cost_) c->red_min_ = h->red_cost_;
+ if (c->blue_max_ < h->blue_cost_) c->blue_max_ = h->blue_cost_;
+ if (c->blue_min_ > h->blue_cost_) c->blue_min_ = h->blue_cost_;
+}
+
+static void UpdateHistogramCost(VP8LHistogram* const h) {
+ uint32_t alpha_sym, red_sym, blue_sym;
+ const double alpha_cost =
+ PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
+ &h->is_used_[3]);
+ const double distance_cost =
+ PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
+ VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+ const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
+ h->literal_cost_ =
+ PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
+ VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+ h->red_cost_ =
+ PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
+ h->blue_cost_ =
+ PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]);
+ h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
+ alpha_cost + distance_cost;
+ if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {
+ h->trivial_symbol_ = VP8L_NON_TRIVIAL_SYM;
+ } else {
+ h->trivial_symbol_ =
+ ((uint32_t)alpha_sym << 24) | (red_sym << 16) | (blue_sym << 0);
+ }
+}
+
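+// Maps 'val' linearly from the [min, max] range to a bin index in
+// [0, NUM_PARTITIONS - 1].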
+static int GetBinIdForEntropy(double min, double max, double val) {
+ const double range = max - min;
+ if (range > 0.) {
+ const double delta = val - min;
+ return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
+ } else {
+ return 0;
+ }
+}
+
+static int GetHistoBinIndex(const VP8LHistogram* const h,
+ const DominantCostRange* const c, int low_effort) {
+ int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_,
+ h->literal_cost_);
+ assert(bin_id < NUM_PARTITIONS);
+ if (!low_effort) {
+ bin_id = bin_id * NUM_PARTITIONS
+ + GetBinIdForEntropy(c->red_min_, c->red_max_, h->red_cost_);
+ bin_id = bin_id * NUM_PARTITIONS
+ + GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_);
+ assert(bin_id < BIN_SIZE);
+ }
+ return bin_id;
+}
+
+// Construct the histograms from backward references.
+static void HistogramBuild(
+ int xsize, int histo_bits, const VP8LBackwardRefs* const backward_refs,
+ VP8LHistogramSet* const image_histo) {
+ int x = 0, y = 0;
+ const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
+ VP8LHistogram** const histograms = image_histo->histograms;
+ VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
+ assert(histo_bits > 0);
+ VP8LHistogramSetClear(image_histo);
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
+ VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
+ x += PixOrCopyLength(v);
+ while (x >= xsize) {
+ x -= xsize;
+ ++y;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+// Copies the histograms and computes their bit costs.
+static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1);
+static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
+ VP8LHistogramSet* const image_histo,
+ int* const num_used,
+ uint16_t* const histogram_symbols) {
+ int i, cluster_id;
+ int num_used_orig = *num_used;
+ VP8LHistogram** const orig_histograms = orig_histo->histograms;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ assert(image_histo->max_size == orig_histo->max_size);
+ for (cluster_id = 0, i = 0; i < orig_histo->max_size; ++i) {
+ VP8LHistogram* const histo = orig_histograms[i];
+ UpdateHistogramCost(histo);
+
+ // Skip the histogram if it is completely empty, which can happen for tiles
+ // with no information (when they are skipped because of LZ77).
+ if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2]
+ && !histo->is_used_[3] && !histo->is_used_[4]) {
+      // The first histogram is always used. If a histogram is empty, we set
+ // its id to be the same as the previous one: this will improve
+ // compressibility for later LZ77.
+ assert(i > 0);
+ HistogramSetRemoveHistogram(image_histo, i, num_used);
+ HistogramSetRemoveHistogram(orig_histo, i, &num_used_orig);
+ histogram_symbols[i] = kInvalidHistogramSymbol;
+ } else {
+ // Copy histograms from orig_histo[] to image_histo[].
+ HistogramCopy(histo, histograms[i]);
+ histogram_symbols[i] = cluster_id++;
+ assert(cluster_id <= image_histo->max_size);
+ }
+ }
+}
+
+// Partition histograms to different entropy bins for three dominant (literal,
+// red and blue) symbol costs and compute the histogram aggregate bit_cost.
+static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
+ uint16_t* const bin_map,
+ int low_effort) {
+ int i;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ const int histo_size = image_histo->size;
+ DominantCostRange cost_range;
+ DominantCostRangeInit(&cost_range);
+
+ // Analyze the dominant (literal, red and blue) entropy costs.
+ for (i = 0; i < histo_size; ++i) {
+ if (histograms[i] == NULL) continue;
+ UpdateDominantCostRange(histograms[i], &cost_range);
+ }
+
+  // bin-hash histograms on the three dominant (literal, red and blue)
+  // symbol costs and store the resulting bin_id for each histogram.
+ for (i = 0; i < histo_size; ++i) {
+    // bin_map[i] is not set to a special value as its use will later be
+    // guarded by another (histograms[i] == NULL) check.
+ if (histograms[i] == NULL) continue;
+ bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
+ }
+}
+
+// Merges some histograms with the same bin_id together if it's advantageous.
+// Sets the remaining histograms to NULL.
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+ int* num_used,
+ const uint16_t* const clusters,
+ uint16_t* const cluster_mappings,
+ VP8LHistogram* cur_combo,
+ const uint16_t* const bin_map,
+ int num_bins,
+ double combine_cost_factor,
+ int low_effort) {
+ VP8LHistogram** const histograms = image_histo->histograms;
+ int idx;
+ struct {
+ int16_t first; // position of the histogram that accumulates all
+ // histograms with the same bin_id
+ uint16_t num_combine_failures; // number of combine failures per bin_id
+ } bin_info[BIN_SIZE];
+
+ assert(num_bins <= BIN_SIZE);
+ for (idx = 0; idx < num_bins; ++idx) {
+ bin_info[idx].first = -1;
+ bin_info[idx].num_combine_failures = 0;
+ }
+
+ // By default, a cluster matches itself.
+ for (idx = 0; idx < *num_used; ++idx) cluster_mappings[idx] = idx;
+ for (idx = 0; idx < image_histo->size; ++idx) {
+ int bin_id, first;
+ if (histograms[idx] == NULL) continue;
+ bin_id = bin_map[idx];
+ first = bin_info[bin_id].first;
+ if (first == -1) {
+ bin_info[bin_id].first = idx;
+ } else if (low_effort) {
+ HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+ HistogramSetRemoveHistogram(image_histo, idx, num_used);
+ cluster_mappings[clusters[idx]] = clusters[first];
+ } else {
+ // try to merge #idx into #first (both share the same bin_id)
+ const double bit_cost = histograms[idx]->bit_cost_;
+ const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+ const double curr_cost_diff =
+ HistogramAddEval(histograms[first], histograms[idx],
+ cur_combo, bit_cost_thresh);
+ if (curr_cost_diff < bit_cost_thresh) {
+ // Try to merge two histograms only if the combo is a trivial one or
+ // the two candidate histograms are already non-trivial.
+ // For some images, 'try_combine' turns out to be false for a lot of
+        // histogram pairs. In that case, we fall back to combining
+ // histograms as usual to avoid increasing the header size.
+ const int try_combine =
+ (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+ ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+ (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+ const int max_combine_failures = 32;
+ if (try_combine ||
+ bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+ // move the (better) merged histogram to its final slot
+ HistogramSwap(&cur_combo, &histograms[first]);
+ HistogramSetRemoveHistogram(image_histo, idx, num_used);
+ cluster_mappings[clusters[idx]] = clusters[first];
+ } else {
+ ++bin_info[bin_id].num_combine_failures;
+ }
+ }
+ }
+ }
+ if (low_effort) {
+ // for low_effort case, update the final cost when everything is merged
+ for (idx = 0; idx < image_histo->size; ++idx) {
+ if (histograms[idx] == NULL) continue;
+ UpdateHistogramCost(histograms[idx]);
+ }
+ }
+}
+
+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 - 1.
+static uint32_t MyRand(uint32_t* const seed) {
+ *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+ assert(*seed > 0);
+ return *seed;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram pairs priority queue
+
+// Pair of histograms. Negative idx1 value means that pair is out-of-date.
+typedef struct {
+ int idx1;
+ int idx2;
+ double cost_diff;
+ double cost_combo;
+} HistogramPair;
+
+typedef struct {
+ HistogramPair* queue;
+ int size;
+ int max_size;
+} HistoQueue;
+
+static int HistoQueueInit(HistoQueue* const histo_queue, const int max_size) {
+ histo_queue->size = 0;
+ histo_queue->max_size = max_size;
+ // We allocate max_size + 1 because the last element at index "size" is
+ // used as temporary data (and it could be up to max_size).
+ histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+ histo_queue->max_size + 1, sizeof(*histo_queue->queue));
+ return histo_queue->queue != NULL;
+}
+
+static void HistoQueueClear(HistoQueue* const histo_queue) {
+ assert(histo_queue != NULL);
+ WebPSafeFree(histo_queue->queue);
+ histo_queue->size = 0;
+ histo_queue->max_size = 0;
+}
+
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+ HistogramPair* const pair) {
+ assert(pair >= histo_queue->queue &&
+ pair < (histo_queue->queue + histo_queue->size));
+ assert(histo_queue->size > 0);
+ *pair = histo_queue->queue[histo_queue->size - 1];
+ --histo_queue->size;
+}
+
+// Check whether a pair in the queue should be updated as head or not.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+ HistogramPair* const pair) {
+ assert(pair->cost_diff < 0.);
+ assert(pair >= histo_queue->queue &&
+ pair < (histo_queue->queue + histo_queue->size));
+ assert(histo_queue->size > 0);
+ if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+ // Replace the best pair.
+ const HistogramPair tmp = histo_queue->queue[0];
+ histo_queue->queue[0] = *pair;
+ *pair = tmp;
+ }
+}
+
+// Update the cost diff and combo of a pair of histograms. This needs to be
+// called when the histograms have been merged with a third one.
+static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
+ const VP8LHistogram* const h2,
+ double threshold,
+ HistogramPair* const pair) {
+ const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+ pair->cost_combo = 0.;
+ GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
+ pair->cost_diff = pair->cost_combo - sum_cost;
+}
+
+// Create a pair from indices "idx1" and "idx2" provided its cost is below
+// "threshold" (a negative entropy).
+// It returns the cost of the pair, or 0. if it is above the threshold.
+static double HistoQueuePush(HistoQueue* const histo_queue,
+ VP8LHistogram** const histograms, int idx1,
+ int idx2, double threshold) {
+ const VP8LHistogram* h1;
+ const VP8LHistogram* h2;
+ HistogramPair pair;
+
+ // Stop here if the queue is full.
+ if (histo_queue->size == histo_queue->max_size) return 0.;
+ assert(threshold <= 0.);
+ if (idx1 > idx2) {
+ const int tmp = idx2;
+ idx2 = idx1;
+ idx1 = tmp;
+ }
+ pair.idx1 = idx1;
+ pair.idx2 = idx2;
+ h1 = histograms[idx1];
+ h2 = histograms[idx2];
+
+ HistoQueueUpdatePair(h1, h2, threshold, &pair);
+
+ // Do not even consider the pair if it does not improve the entropy.
+ if (pair.cost_diff >= threshold) return 0.;
+
+ histo_queue->queue[histo_queue->size++] = pair;
+ HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+ return pair.cost_diff;
+}
+
+// -----------------------------------------------------------------------------
+
+// Combines histograms by continuously choosing the one with the highest cost
+// reduction.
+static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
+ int* const num_used) {
+ int ok = 0;
+ const int image_histo_size = image_histo->size;
+ int i, j;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ // Priority queue of histogram pairs.
+ HistoQueue histo_queue;
+
+ // image_histo_size^2 for the queue size is safe. If you look at
+  // HistogramCombineGreedy, and imagine that HistoQueuePush always pushes
+ // data to the queue, you insert at most:
+ // - image_histo_size*(image_histo_size-1)/2 (the first two for loops)
+ // - image_histo_size - 1 in the last for loop at the first iteration of
+ // the while loop, image_histo_size - 2 at the second iteration ...
+ // therefore image_histo_size*(image_histo_size-1)/2 overall too
+ if (!HistoQueueInit(&histo_queue, image_histo_size * image_histo_size)) {
+ goto End;
+ }
+
+ for (i = 0; i < image_histo_size; ++i) {
+ if (image_histo->histograms[i] == NULL) continue;
+ for (j = i + 1; j < image_histo_size; ++j) {
+ // Initialize queue.
+ if (image_histo->histograms[j] == NULL) continue;
+ HistoQueuePush(&histo_queue, histograms, i, j, 0.);
+ }
+ }
+
+ while (histo_queue.size > 0) {
+ const int idx1 = histo_queue.queue[0].idx1;
+ const int idx2 = histo_queue.queue[0].idx2;
+ HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+ histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+
+ // Remove merged histogram.
+ HistogramSetRemoveHistogram(image_histo, idx2, num_used);
+
+ // Remove pairs intersecting the just combined best pair.
+ for (i = 0; i < histo_queue.size;) {
+ HistogramPair* const p = histo_queue.queue + i;
+ if (p->idx1 == idx1 || p->idx2 == idx1 ||
+ p->idx1 == idx2 || p->idx2 == idx2) {
+ HistoQueuePopPair(&histo_queue, p);
+ } else {
+ HistoQueueUpdateHead(&histo_queue, p);
+ ++i;
+ }
+ }
+
+ // Push new pairs formed with combined histogram to the queue.
+ for (i = 0; i < image_histo->size; ++i) {
+ if (i == idx1 || image_histo->histograms[i] == NULL) continue;
+ HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.);
+ }
+ }
+
+ ok = 1;
+
+ End:
+ HistoQueueClear(&histo_queue);
+ return ok;
+}
+
+static int PairComparison(const void* idx1, const void* idx2) {
+ // To be used with bsearch: <0 when *idx1<*idx2, >0 if >, 0 when ==.
+ return (*(int*) idx1 - *(int*) idx2);
+}
+
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+ int* const num_used, int min_cluster_size,
+ int* const do_greedy) {
+ int j, iter;
+ uint32_t seed = 1;
+ int tries_with_no_success = 0;
+ const int outer_iters = *num_used;
+ const int num_tries_no_success = outer_iters / 2;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ // Priority queue of histogram pairs. Its size, 'kHistoQueueSize', affects
+ // both compression quality and speed: the smaller it is, the faster the
+ // encoding but the worse the compression.
+ HistoQueue histo_queue;
+ const int kHistoQueueSize = 9;
+ int ok = 0;
+ // Mapping from the compacted indices (those with a non-NULL histogram) to
+ // the corresponding indices in the full image_histo.
+ int* mappings;
+
+ if (*num_used < min_cluster_size) {
+ *do_greedy = 1;
+ return 1;
+ }
+
+ mappings = (int*) WebPSafeMalloc(*num_used, sizeof(*mappings));
+ if (mappings == NULL) return 0;
+ if (!HistoQueueInit(&histo_queue, kHistoQueueSize)) goto End;
+ // Fill the initial mapping.
+ for (j = 0, iter = 0; iter < image_histo->size; ++iter) {
+ if (histograms[iter] == NULL) continue;
+ mappings[j++] = iter;
+ }
+ assert(j == *num_used);
+
+ // Collapse similar histograms in 'image_histo'.
+ for (iter = 0;
+ iter < outer_iters && *num_used >= min_cluster_size &&
+ ++tries_with_no_success < num_tries_no_success;
+ ++iter) {
+ int* mapping_index;
+ double best_cost =
+ (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+ int best_idx1 = -1, best_idx2 = 1;
+ const uint32_t rand_range = (*num_used - 1) * (*num_used);
+ // (*num_used) / 2 was chosen empirically. Less means faster but worse
+ // compression.
+ const int num_tries = (*num_used) / 2;
+
+ // Pick random samples.
+ for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
+ double curr_cost;
+ // Choose two different histograms at random and try to combine them.
+ const uint32_t tmp = MyRand(&seed) % rand_range;
+ uint32_t idx1 = tmp / (*num_used - 1);
+ uint32_t idx2 = tmp % (*num_used - 1);
+ if (idx2 >= idx1) ++idx2;
+ idx1 = mappings[idx1];
+ idx2 = mappings[idx2];
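+ // Illustrative note (not in the original sources): the mapping above
+ // yields a uniformly random ordered pair of distinct compact indices.
+ // E.g. with *num_used == 4, rand_range == 12; tmp == 7 gives
+ // idx1 = 7 / 3 = 2 and idx2 = 7 % 3 = 1 (kept, since 1 < 2), while
+ // tmp == 8 gives idx1 = 2 and idx2 = 8 % 3 = 2, bumped to 3 so that
+ // idx2 never equals idx1.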
+
+ // Calculate cost reduction on combination.
+ curr_cost =
+ HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+ if (curr_cost < 0) { // found a better pair?
+ best_cost = curr_cost;
+ // Stop sampling once the queue has reached full capacity; the
+ // merge step below will shrink it.
+ if (histo_queue.size == histo_queue.max_size) break;
+ }
+ }
+ if (histo_queue.size == 0) continue;
+
+ // Get the best histograms.
+ best_idx1 = histo_queue.queue[0].idx1;
+ best_idx2 = histo_queue.queue[0].idx2;
+ assert(best_idx1 < best_idx2);
+ // Pop best_idx2 from mappings.
+ mapping_index = (int*) bsearch(&best_idx2, mappings, *num_used,
+ sizeof(best_idx2), &PairComparison);
+ assert(mapping_index != NULL);
+ memmove(mapping_index, mapping_index + 1, sizeof(*mapping_index) *
+ ((*num_used) - (mapping_index - mappings) - 1));
+ // Merge the histograms and remove best_idx2 from the queue.
+ HistogramAdd(histograms[best_idx2], histograms[best_idx1],
+ histograms[best_idx1]);
+ histograms[best_idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+ HistogramSetRemoveHistogram(image_histo, best_idx2, num_used);
+ // Parse the queue and update each pair that deals with best_idx1 or
+ // best_idx2.
+ for (j = 0; j < histo_queue.size;) {
+ HistogramPair* const p = histo_queue.queue + j;
+ const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+ const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+ int do_eval = 0;
+ // The front pair could have been duplicated by an earlier random pick,
+ // so always check for duplicates regardless.
+ if (is_idx1_best && is_idx2_best) {
+ HistoQueuePopPair(&histo_queue, p);
+ continue;
+ }
+ // Any pair containing one of the two best indices should only refer to
+ // best_idx1. Its cost should also be updated.
+ if (is_idx1_best) {
+ p->idx1 = best_idx1;
+ do_eval = 1;
+ } else if (is_idx2_best) {
+ p->idx2 = best_idx1;
+ do_eval = 1;
+ }
+ // Make sure the index order is respected.
+ if (p->idx1 > p->idx2) {
+ const int tmp = p->idx2;
+ p->idx2 = p->idx1;
+ p->idx1 = tmp;
+ }
+ if (do_eval) {
+ // Re-evaluate the cost of an updated pair.
+ HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p);
+ if (p->cost_diff >= 0.) {
+ HistoQueuePopPair(&histo_queue, p);
+ continue;
+ }
+ }
+ HistoQueueUpdateHead(&histo_queue, p);
+ ++j;
+ }
+ tries_with_no_success = 0;
+ }
+ *do_greedy = (*num_used <= min_cluster_size);
+ ok = 1;
+
+End:
+ HistoQueueClear(&histo_queue);
+ WebPSafeFree(mappings);
+ return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Find the best 'out' histogram for each of the 'in' histograms.
+// At call-time, 'out' contains the histograms of the clusters.
+// Note: we assume that out[]->bit_cost_ is already up-to-date.
+static void HistogramRemap(const VP8LHistogramSet* const in,
+ VP8LHistogramSet* const out,
+ uint16_t* const symbols) {
+ int i;
+ VP8LHistogram** const in_histo = in->histograms;
+ VP8LHistogram** const out_histo = out->histograms;
+ const int in_size = out->max_size;
+ const int out_size = out->size;
+ if (out_size > 1) {
+ for (i = 0; i < in_size; ++i) {
+ int best_out = 0;
+ double best_bits = MAX_COST;
+ int k;
+ if (in_histo[i] == NULL) {
+ // Arbitrarily set to the previous value if unused to help future LZ77.
+ symbols[i] = symbols[i - 1];
+ continue;
+ }
+ for (k = 0; k < out_size; ++k) {
+ double cur_bits;
+ cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
+ if (k == 0 || cur_bits < best_bits) {
+ best_bits = cur_bits;
+ best_out = k;
+ }
+ }
+ symbols[i] = best_out;
+ }
+ } else {
+ assert(out_size == 1);
+ for (i = 0; i < in_size; ++i) {
+ symbols[i] = 0;
+ }
+ }
+
+ // Recompute each output histogram from the original histograms and the
+ // symbol assignment.
+ VP8LHistogramSetClear(out);
+ out->size = out_size;
+
+ for (i = 0; i < in_size; ++i) {
+ int idx;
+ if (in_histo[i] == NULL) continue;
+ idx = symbols[i];
+ HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+ }
+}
+
+static double GetCombineCostFactor(int histo_size, int quality) {
+ double combine_cost_factor = 0.16;
+ if (quality < 90) {
+ if (histo_size > 256) combine_cost_factor /= 2.;
+ if (histo_size > 512) combine_cost_factor /= 2.;
+ if (histo_size > 1024) combine_cost_factor /= 2.;
+ if (quality <= 50) combine_cost_factor /= 2.;
+ }
+ return combine_cost_factor;
+}
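+// For example (illustrative values): quality == 40 with histo_size == 600
+// halves the 0.16 default three times (histo_size > 256, histo_size > 512,
+// and quality <= 50), giving a combine_cost_factor of 0.02; at quality >= 90
+// the factor always stays at 0.16.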
+
+// Given a HistogramSet 'set', the cluster mapping 'cluster_mappings' and the
+// current assignment of the cells in 'symbols', merge the clusters and
+// assign the smallest possible cluster values.
+static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
+ uint16_t* const cluster_mappings,
+ int num_clusters,
+ uint16_t* const cluster_mappings_tmp,
+ uint16_t* const symbols) {
+ int i, cluster_max;
+ int do_continue = 1;
+ // First, assign the lowest cluster to each pixel.
+ while (do_continue) {
+ do_continue = 0;
+ for (i = 0; i < num_clusters; ++i) {
+ int k;
+ k = cluster_mappings[i];
+ while (k != cluster_mappings[k]) {
+ cluster_mappings[k] = cluster_mappings[cluster_mappings[k]];
+ k = cluster_mappings[k];
+ }
+ if (k != cluster_mappings[i]) {
+ do_continue = 1;
+ cluster_mappings[i] = k;
+ }
+ }
+ }
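+ // Illustrative example of the collapsing loop above: starting from
+ // cluster_mappings == {0, 0, 1, 2}, entry 2 follows 2 -> 1 -> 0 and
+ // entry 3 follows 3 -> 2 -> 0, so after one sweep every entry maps
+ // directly to its root: {0, 0, 0, 0}.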
+ // Create a mapping from a cluster id to its minimal version.
+ cluster_max = 0;
+ memset(cluster_mappings_tmp, 0,
+ set->max_size * sizeof(*cluster_mappings_tmp));
+ assert(cluster_mappings[0] == 0);
+ // Re-map the ids.
+ for (i = 0; i < set->max_size; ++i) {
+ int cluster;
+ if (symbols[i] == kInvalidHistogramSymbol) continue;
+ cluster = cluster_mappings[symbols[i]];
+ assert(symbols[i] < num_clusters);
+ if (cluster > 0 && cluster_mappings_tmp[cluster] == 0) {
+ ++cluster_max;
+ cluster_mappings_tmp[cluster] = cluster_max;
+ }
+ symbols[i] = cluster_mappings_tmp[cluster];
+ }
+
+ // Make sure all cluster values are used.
+ cluster_max = 0;
+ for (i = 0; i < set->max_size; ++i) {
+ if (symbols[i] == kInvalidHistogramSymbol) continue;
+ if (symbols[i] <= cluster_max) continue;
+ ++cluster_max;
+ assert(symbols[i] == cluster_max);
+ }
+}
+
+static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
+ uint32_t size;
+ int i;
+ for (i = 0, size = 0; i < image_histo->size; ++i) {
+ if (image_histo->histograms[i] == NULL) continue;
+ image_histo->histograms[size++] = image_histo->histograms[i];
+ }
+ image_histo->size = size;
+}
+
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+ const VP8LBackwardRefs* const refs,
+ int quality, int low_effort,
+ int histogram_bits, int cache_bits,
+ VP8LHistogramSet* const image_histo,
+ VP8LHistogram* const tmp_histo,
+ uint16_t* const histogram_symbols) {
+ int ok = 0;
+ const int histo_xsize =
+ histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
+ const int histo_ysize =
+ histogram_bits ? VP8LSubSampleSize(ysize, histogram_bits) : 1;
+ const int image_histo_raw_size = histo_xsize * histo_ysize;
+ VP8LHistogramSet* const orig_histo =
+ VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+ // Don't attempt linear bin-partition heuristic for
+ // histograms of small sizes (as bin_map will be very sparse) and
+ // maximum quality q==100 (to preserve the compression gains at that level).
+ const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
+ int entropy_combine;
+ uint16_t* const map_tmp =
+ WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
+ uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
+ int num_used = image_histo_raw_size;
+ if (orig_histo == NULL || map_tmp == NULL) goto Error;
+
+ // Construct the histograms from backward references.
+ HistogramBuild(xsize, histogram_bits, refs, orig_histo);
+ // Copy the histograms and compute their bit_cost; 'histogram_symbols' is
+ // updated along the way.
+ HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used,
+ histogram_symbols);
+
+ entropy_combine =
+ (num_used > entropy_combine_num_bins * 2) && (quality < 100);
+
+ if (entropy_combine) {
+ uint16_t* const bin_map = map_tmp;
+ const double combine_cost_factor =
+ GetCombineCostFactor(image_histo_raw_size, quality);
+ const uint32_t num_clusters = num_used;
+
+ HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
+ // Collapse histograms with similar entropy.
+ HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
+ cluster_mappings, tmp_histo, bin_map,
+ entropy_combine_num_bins, combine_cost_factor,
+ low_effort);
+ OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
+ map_tmp, histogram_symbols);
+ }
+
+ // Don't combine the histograms using stochastic and greedy heuristics for
+ // low-effort compression mode.
+ if (!low_effort || !entropy_combine) {
+ const float x = quality / 100.f;
+ // cubic ramp between 1 and MAX_HISTO_GREEDY:
+ const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
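+ // E.g., assuming MAX_HISTO_GREEDY == 100 (its usual value in libwebp;
+ // the definition is not shown in this hunk): quality 75 gives x = 0.75,
+ // x*x*x ~= 0.42, so threshold_size = (int)(1 + 0.42 * 99) == 42 clusters
+ // before the greedy pass kicks in.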
+ int do_greedy;
+ if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
+ &do_greedy)) {
+ goto Error;
+ }
+ if (do_greedy) {
+ RemoveEmptyHistograms(image_histo);
+ if (!HistogramCombineGreedy(image_histo, &num_used)) {
+ goto Error;
+ }
+ }
+ }
+
+ // Find the optimal map from original histograms to the final ones.
+ RemoveEmptyHistograms(image_histo);
+ HistogramRemap(orig_histo, image_histo, histogram_symbols);
+
+ ok = 1;
+
+ Error:
+ VP8LFreeHistogramSet(orig_histo);
+ WebPSafeFree(map_tmp);
+ return ok;
+}
diff --git a/media/libwebp/enc/histogram_enc.h b/media/libwebp/enc/histogram_enc.h
index ef39b7c6db..bf93ce62b3 100644
--- a/media/libwebp/enc/histogram_enc.h
+++ b/media/libwebp/enc/histogram_enc.h
@@ -64,8 +64,8 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
const VP8LBackwardRefs* const refs,
int palette_code_bits);
-// Return the size of the histogram for a given palette_code_bits.
-int VP8LGetHistogramSize(int palette_code_bits);
+// Return the size of the histogram for a given cache_bits.
+int VP8LGetHistogramSize(int cache_bits);
// Set the palette_code_bits and reset the stats.
// If init_arrays is true, the arrays are also filled with 0's.
@@ -110,7 +110,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
const VP8LBackwardRefs* const refs,
int quality, int low_effort,
int histogram_bits, int cache_bits,
- VP8LHistogramSet* const image_in,
+ VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols);
diff --git a/media/libwebp/enc/iterator_enc.c b/media/libwebp/enc/iterator_enc.c
new file mode 100644
index 0000000000..c2b137c124
--- /dev/null
+++ b/media/libwebp/enc/iterator_enc.c
@@ -0,0 +1,459 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// VP8Iterator: block iterator
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// VP8Iterator
+//------------------------------------------------------------------------------
+
+static void InitLeft(VP8EncIterator* const it) {
+ it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+ (it->y_ > 0) ? 129 : 127;
+ memset(it->y_left_, 129, 16);
+ memset(it->u_left_, 129, 8);
+ memset(it->v_left_, 129, 8);
+ it->left_nz_[8] = 0;
+ if (it->top_derr_ != NULL) {
+ memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+ }
+}
+
+static void InitTop(VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ const size_t top_size = enc->mb_w_ * 16;
+ memset(enc->y_top_, 127, 2 * top_size);
+ memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+ if (enc->top_derr_ != NULL) {
+ memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+ }
+}
+
+void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
+ VP8Encoder* const enc = it->enc_;
+ it->x_ = 0;
+ it->y_ = y;
+ it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
+ it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
+ it->nz_ = enc->nz_;
+ it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+ it->y_top_ = enc->y_top_;
+ it->uv_top_ = enc->uv_top_;
+ InitLeft(it);
+}
+
+void VP8IteratorReset(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ VP8IteratorSetRow(it, 0);
+ VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
+ InitTop(it);
+ memset(it->bit_count_, 0, sizeof(it->bit_count_));
+ it->do_trellis_ = 0;
+}
+
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+ it->count_down_ = it->count_down0_ = count_down;
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+ return (it->count_down_ <= 0);
+}
+
+void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
+ it->enc_ = enc;
+ it->yuv_in_ = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
+ it->yuv_out_ = it->yuv_in_ + YUV_SIZE_ENC;
+ it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
+ it->yuv_p_ = it->yuv_out2_ + YUV_SIZE_ENC;
+ it->lf_stats_ = enc->lf_stats_;
+ it->percent0_ = enc->percent_;
+ it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
+ it->u_left_ = it->y_left_ + 16 + 16;
+ it->v_left_ = it->u_left_ + 16;
+ it->top_derr_ = enc->top_derr_;
+ VP8IteratorReset(it);
+}
+
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
+ VP8Encoder* const enc = it->enc_;
+ if (delta && enc->pic_->progress_hook != NULL) {
+ const int done = it->count_down0_ - it->count_down_;
+ const int percent = (it->count_down0_ <= 0)
+ ? it->percent0_
+ : it->percent0_ + delta * done / it->count_down0_;
+ return WebPReportProgress(enc->pic_, percent, &enc->percent_);
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Import the source samples into the cache. Takes care of replicating
+// boundary pixels if necessary.
+
+static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
+
+static void ImportBlock(const uint8_t* src, int src_stride,
+ uint8_t* dst, int w, int h, int size) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ memcpy(dst, src, w);
+ if (w < size) {
+ memset(dst + w, dst[w - 1], size - w);
+ }
+ dst += BPS;
+ src += src_stride;
+ }
+ for (i = h; i < size; ++i) {
+ memcpy(dst, dst - BPS, size);
+ dst += BPS;
+ }
+}
+
+static void ImportLine(const uint8_t* src, int src_stride,
+ uint8_t* dst, int len, int total_len) {
+ int i;
+ for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
+ for (; i < total_len; ++i) dst[i] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) {
+ const VP8Encoder* const enc = it->enc_;
+ const int x = it->x_, y = it->y_;
+ const WebPPicture* const pic = enc->pic_;
+ const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+ const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
+ const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
+ const int w = MinSize(pic->width - x * 16, 16);
+ const int h = MinSize(pic->height - y * 16, 16);
+ const int uv_w = (w + 1) >> 1;
+ const int uv_h = (h + 1) >> 1;
+
+ ImportBlock(ysrc, pic->y_stride, it->yuv_in_ + Y_OFF_ENC, w, h, 16);
+ ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
+ ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
+
+ if (tmp_32 == NULL) return;
+
+ // Import source (uncompressed) samples into boundary.
+ if (x == 0) {
+ InitLeft(it);
+ } else {
+ if (y == 0) {
+ it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+ } else {
+ it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+ it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+ it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+ }
+ ImportLine(ysrc - 1, pic->y_stride, it->y_left_, h, 16);
+ ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+ ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+ }
+
+ it->y_top_ = tmp_32 + 0;
+ it->uv_top_ = tmp_32 + 16;
+ if (y == 0) {
+ memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+ } else {
+ ImportLine(ysrc - pic->y_stride, 1, tmp_32, w, 16);
+ ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16, uv_w, 8);
+ ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Copy back the compressed samples into user space if requested.
+
+static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
+ int w, int h) {
+ while (h-- > 0) {
+ memcpy(dst, src, w);
+ dst += dst_stride;
+ src += BPS;
+ }
+}
+
+void VP8IteratorExport(const VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ if (enc->config_->show_compressed) {
+ const int x = it->x_, y = it->y_;
+ const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
+ const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
+ uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
+ uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
+ int w = (pic->width - x * 16);
+ int h = (pic->height - y * 16);
+
+ if (w > 16) w = 16;
+ if (h > 16) h = 16;
+
+ // Luma plane
+ ExportBlock(ysrc, ydst, pic->y_stride, w, h);
+
+ { // U/V planes
+ const int uv_w = (w + 1) >> 1;
+ const int uv_h = (h + 1) >> 1;
+ ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
+ ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Non-zero contexts setup/teardown
+
+// Nz bits:
+// 0 1 2 3 Y
+// 4 5 6 7
+// 8 9 10 11
+// 12 13 14 15
+// 16 17 U
+// 18 19
+// 20 21 V
+// 22 23
+// 24 DC-intra16
+
+// Convert packed context to byte array
+#define BIT(nz, n) (!!((nz) & (1 << (n))))
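+// Example (illustrative): with tnz == 0xF000, bits 12..15 are set, so
+// BIT(tnz, 12) .. BIT(tnz, 15) all evaluate to 1 (the bottom Y row of the
+// macroblock above, per the layout diagram) while e.g. BIT(tnz, 18) is 0.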
+
+void VP8IteratorNzToBytes(VP8EncIterator* const it) {
+ const int tnz = it->nz_[0], lnz = it->nz_[-1];
+ int* const top_nz = it->top_nz_;
+ int* const left_nz = it->left_nz_;
+
+ // Top-Y
+ top_nz[0] = BIT(tnz, 12);
+ top_nz[1] = BIT(tnz, 13);
+ top_nz[2] = BIT(tnz, 14);
+ top_nz[3] = BIT(tnz, 15);
+ // Top-U
+ top_nz[4] = BIT(tnz, 18);
+ top_nz[5] = BIT(tnz, 19);
+ // Top-V
+ top_nz[6] = BIT(tnz, 22);
+ top_nz[7] = BIT(tnz, 23);
+ // DC
+ top_nz[8] = BIT(tnz, 24);
+
+ // left-Y
+ left_nz[0] = BIT(lnz, 3);
+ left_nz[1] = BIT(lnz, 7);
+ left_nz[2] = BIT(lnz, 11);
+ left_nz[3] = BIT(lnz, 15);
+ // left-U
+ left_nz[4] = BIT(lnz, 17);
+ left_nz[5] = BIT(lnz, 19);
+ // left-V
+ left_nz[6] = BIT(lnz, 21);
+ left_nz[7] = BIT(lnz, 23);
+ // left-DC is special, iterated separately
+}
+
+void VP8IteratorBytesToNz(VP8EncIterator* const it) {
+ uint32_t nz = 0;
+ const int* const top_nz = it->top_nz_;
+ const int* const left_nz = it->left_nz_;
+ // top
+ nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
+ nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
+ nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
+ nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
+ nz |= (top_nz[8] << 24); // we propagate the _top_ bit, esp. for intra4
+ // left
+ nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
+ nz |= (left_nz[2] << 11);
+ nz |= (left_nz[4] << 17) | (left_nz[6] << 21);
+
+ *it->nz_ = nz;
+}
+
+#undef BIT
+
+//------------------------------------------------------------------------------
+// Advance to the next position, doing the bookkeeping.
+
+void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const int x = it->x_, y = it->y_;
+ const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
+ if (x < enc->mb_w_ - 1) { // left
+ int i;
+ for (i = 0; i < 16; ++i) {
+ it->y_left_[i] = ysrc[15 + i * BPS];
+ }
+ for (i = 0; i < 8; ++i) {
+ it->u_left_[i] = uvsrc[7 + i * BPS];
+ it->v_left_[i] = uvsrc[15 + i * BPS];
+ }
+ // top-left (before 'top'!)
+ it->y_left_[-1] = it->y_top_[15];
+ it->u_left_[-1] = it->uv_top_[0 + 7];
+ it->v_left_[-1] = it->uv_top_[8 + 7];
+ }
+ if (y < enc->mb_h_ - 1) { // top
+ memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+ memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
+ }
+}
+
+int VP8IteratorNext(VP8EncIterator* const it) {
+ if (++it->x_ == it->enc_->mb_w_) {
+ VP8IteratorSetRow(it, ++it->y_);
+ } else {
+ it->preds_ += 4;
+ it->mb_ += 1;
+ it->nz_ += 1;
+ it->y_top_ += 16;
+ it->uv_top_ += 16;
+ }
+ return (0 < --it->count_down_);
+}
+
+//------------------------------------------------------------------------------
+// Helper function to set mode properties
+
+void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
+ uint8_t* preds = it->preds_;
+ int y;
+ for (y = 0; y < 4; ++y) {
+ memset(preds, mode, 4);
+ preds += it->enc_->preds_w_;
+ }
+ it->mb_->type_ = 1;
+}
+
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
+ uint8_t* preds = it->preds_;
+ int y;
+ for (y = 4; y > 0; --y) {
+ memcpy(preds, modes, 4 * sizeof(*modes));
+ preds += it->enc_->preds_w_;
+ modes += 4;
+ }
+ it->mb_->type_ = 0;
+}
+
+void VP8SetIntraUVMode(const VP8EncIterator* const it, int mode) {
+ it->mb_->uv_mode_ = mode;
+}
+
+void VP8SetSkip(const VP8EncIterator* const it, int skip) {
+ it->mb_->skip_ = skip;
+}
+
+void VP8SetSegment(const VP8EncIterator* const it, int segment) {
+ it->mb_->segment_ = segment;
+}
+
+//------------------------------------------------------------------------------
+// Intra4x4 sub-blocks iteration
+//
+// We store and update the boundary samples in an array of 37 pixels. They
+// are updated as we iterate over and reconstruct each intra4x4 block in
+// turn.
+// The position of the samples has the following snake pattern:
+//
+// 16|17 18 19 20|21 22 23 24|25 26 27 28|29 30 31 32|33 34 35 36 <- Top-right
+// --+-----------+-----------+-----------+-----------+
+// 15| 19| 23| 27| 31|
+// 14| 18| 22| 26| 30|
+// 13| 17| 21| 25| 29|
+// 12|13 14 15 16|17 18 19 20|21 22 23 24|25 26 27 28|
+// --+-----------+-----------+-----------+-----------+
+// 11| 15| 19| 23| 27|
+// 10| 14| 18| 22| 26|
+// 9| 13| 17| 21| 25|
+// 8| 9 10 11 12|13 14 15 16|17 18 19 20|21 22 23 24|
+// --+-----------+-----------+-----------+-----------+
+// 7| 11| 15| 19| 23|
+// 6| 10| 14| 18| 22|
+// 5| 9| 13| 17| 21|
+// 4| 5 6 7 8| 9 10 11 12|13 14 15 16|17 18 19 20|
+// --+-----------+-----------+-----------+-----------+
+// 3| 7| 11| 15| 19|
+// 2| 6| 10| 14| 18|
+// 1| 5| 9| 13| 17|
+// 0| 1 2 3 4| 5 6 7 8| 9 10 11 12|13 14 15 16|
+// --+-----------+-----------+-----------+-----------+
+
+// Array to record the position of the top sample to pass to the prediction
+// functions in dsp.c.
+static const uint8_t VP8TopLeftI4[16] = {
+ 17, 21, 25, 29,
+ 13, 17, 21, 25,
+ 9, 13, 17, 21,
+ 5, 9, 13, 17
+};
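+// Example of how this table pairs with the snake diagram above (a reading
+// note, not upstream documentation): for the first sub-block (it->i4_ == 0),
+// VP8TopLeftI4[0] == 17, so i4_top_ points at i4_boundary_[17] (the first
+// top sample) and i4_top_[-1] is i4_boundary_[16], the top-left corner.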
+
+void VP8IteratorStartI4(VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ int i;
+
+ it->i4_ = 0; // first 4x4 sub-block
+ it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[0];
+
+ // Import the boundary samples
+ for (i = 0; i < 17; ++i) { // left
+ it->i4_boundary_[i] = it->y_left_[15 - i];
+ }
+ for (i = 0; i < 16; ++i) { // top
+ it->i4_boundary_[17 + i] = it->y_top_[i];
+ }
+ // top-right samples have a special case on the far right of the picture
+ if (it->x_ < enc->mb_w_ - 1) {
+ for (i = 16; i < 16 + 4; ++i) {
+ it->i4_boundary_[17 + i] = it->y_top_[i];
+ }
+ } else { // else, replicate the last valid pixel four times
+ for (i = 16; i < 16 + 4; ++i) {
+ it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
+ }
+ }
+ VP8IteratorNzToBytes(it); // import the non-zero context
+}
+
+int VP8IteratorRotateI4(VP8EncIterator* const it,
+ const uint8_t* const yuv_out) {
+ const uint8_t* const blk = yuv_out + VP8Scan[it->i4_];
+ uint8_t* const top = it->i4_top_;
+ int i;
+
+ // Update the cache with 7 fresh samples
+ for (i = 0; i <= 3; ++i) {
+ top[-4 + i] = blk[i + 3 * BPS]; // store future top samples
+ }
+ if ((it->i4_ & 3) != 3) { // if not on the rightmost sub-blocks #3, #7, #11, #15
+ for (i = 0; i <= 2; ++i) { // store future left samples
+ top[i] = blk[3 + (2 - i) * BPS];
+ }
+ } else { // else replicate the top-right samples, as the spec says.
+ for (i = 0; i <= 3; ++i) {
+ top[i] = top[i + 4];
+ }
+ }
+ // move pointers to next sub-block
+ ++it->i4_;
+ if (it->i4_ == 16) { // we're done
+ return 0;
+ }
+
+ it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[it->i4_];
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/moz.build b/media/libwebp/enc/moz.build
new file mode 100644
index 0000000000..12eaf5a5ed
--- /dev/null
+++ b/media/libwebp/enc/moz.build
@@ -0,0 +1,39 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files('**'):
+ BUG_COMPONENT = ('Core', 'ImageLib')
+
+SOURCES += [
+ 'alpha_enc.c',
+ 'analysis_enc.c',
+ 'backward_references_cost_enc.c',
+ 'backward_references_enc.c',
+ 'config_enc.c',
+ 'cost_enc.c',
+ 'filter_enc.c',
+ 'frame_enc.c',
+ 'histogram_enc.c',
+ 'iterator_enc.c',
+ 'near_lossless_enc.c',
+ 'picture_csp_enc.c',
+ 'picture_enc.c',
+ 'picture_psnr_enc.c',
+ 'picture_rescale_enc.c',
+ 'picture_tools_enc.c',
+ 'predictor_enc.c',
+ 'quant_enc.c',
+ 'syntax_enc.c',
+ 'token_enc.c',
+ 'tree_enc.c',
+ 'vp8l_enc.c',
+ 'webp_enc.c',
+]
+
+FINAL_LIBRARY = 'gkmedias'
+
+# We allow warnings for third-party code that can be updated from upstream.
+ALLOW_COMPILER_WARNINGS = True
diff --git a/media/libwebp/enc/near_lossless_enc.c b/media/libwebp/enc/near_lossless_enc.c
new file mode 100644
index 0000000000..1fd12a4364
--- /dev/null
+++ b/media/libwebp/enc/near_lossless_enc.c
@@ -0,0 +1,151 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Near-lossless image preprocessing adjusts pixel values to help
+// compressibility with a guarantee of maximum deviation between original and
+// resulting pixel values.
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+// Converted to C by Aleksander Kramarz (akramarz@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+#include "../enc/vp8li_enc.h"
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+
+#define MIN_DIM_FOR_NEAR_LOSSLESS 64
+#define MAX_LIMIT_BITS 5
+
+// Quantizes the value up or down to a multiple of 1<<bits (or to 255),
+// choosing the closer one, resolving ties using bankers' rounding.
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+ const uint32_t mask = (1u << bits) - 1;
+ const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
+ assert(bits > 0);
+ if (biased > 0xff) return 0xff;
+ return biased & ~mask;
+}
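+// Worked example (illustrative): with bits == 2 (multiples of 4), a == 203
+// gives biased == 203 + 1 + 0 == 204, i.e. it rounds up to 204 (distance 1
+// instead of 3). The tie a == 202 rounds down to 200 (quotient 50, even),
+// while the tie a == 206 rounds up to 208 (quotient 52, even): bankers'
+// rounding, as promised above.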
+
+// Applies FindClosestDiscretized to all channels of pixel.
+static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
+ return
+ (FindClosestDiscretized(a >> 24, bits) << 24) |
+ (FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
+ (FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
+ (FindClosestDiscretized(a & 0xff, bits));
+}
+
+// Checks if distance between corresponding channel values of pixels a and b
+// is within the given limit.
+static int IsNear(uint32_t a, uint32_t b, int limit) {
+ int k;
+ for (k = 0; k < 4; ++k) {
+ const int delta =
+ (int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
+ if (delta >= limit || delta <= -limit) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int IsSmooth(const uint32_t* const prev_row,
+ const uint32_t* const curr_row,
+ const uint32_t* const next_row,
+ int ix, int limit) {
+ // Check that all pixels in 4-connected neighborhood are smooth.
+ return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
+ IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
+ IsNear(curr_row[ix], prev_row[ix], limit) &&
+ IsNear(curr_row[ix], next_row[ix], limit));
+}
+
+// Adjusts the pixel values of the image so as to improve compressibility,
+// while staying within the given maximum error.
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+ int stride, int limit_bits, uint32_t* copy_buffer,
+ uint32_t* argb_dst) {
+ int x, y;
+ const int limit = 1 << limit_bits;
+ uint32_t* prev_row = copy_buffer;
+ uint32_t* curr_row = prev_row + xsize;
+ uint32_t* next_row = curr_row + xsize;
+ memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+ memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+
+ for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+ if (y == 0 || y == ysize - 1) {
+ memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+ } else {
+ memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+ argb_dst[0] = argb_src[0];
+ argb_dst[xsize - 1] = argb_src[xsize - 1];
+ for (x = 1; x < xsize - 1; ++x) {
+ if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+ argb_dst[x] = curr_row[x];
+ } else {
+ argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+ }
+ }
+ }
+ {
+ // Three-way swap.
+ uint32_t* const temp = prev_row;
+ prev_row = curr_row;
+ curr_row = next_row;
+ next_row = temp;
+ }
+ }
+}
+
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+ uint32_t* const argb_dst) {
+ int i;
+ const int xsize = picture->width;
+ const int ysize = picture->height;
+ const int stride = picture->argb_stride;
+ uint32_t* const copy_buffer =
+ (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
+ const int limit_bits = VP8LNearLosslessBits(quality);
+ assert(argb_dst != NULL);
+ assert(limit_bits > 0);
+ assert(limit_bits <= MAX_LIMIT_BITS);
+ if (copy_buffer == NULL) {
+ return 0;
+ }
+ // For small icon images, don't attempt to apply near-lossless compression.
+ if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+ ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+ ysize < 3) {
+ for (i = 0; i < ysize; ++i) {
+ memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+ xsize * sizeof(*argb_dst));
+ }
+ WebPSafeFree(copy_buffer);
+ return 1;
+ }
+
+ NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+ argb_dst);
+ for (i = limit_bits - 1; i != 0; --i) {
+ NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
+ }
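+ // Note on the passes above (a reading note, not upstream documentation):
+ // with limit_bits == 3, NearLossless() first runs on the source with the
+ // full limit, then re-runs in place on argb_dst with limits 2 and 1, so
+ // the error budget shrinks at each pass.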
+ WebPSafeFree(copy_buffer);
+ return 1;
+}
+#else // (WEBP_NEAR_LOSSLESS == 1)
+
+// Define a stub to suppress compiler warnings.
+extern void VP8LNearLosslessStub(void);
+void VP8LNearLosslessStub(void) {}
+
+#endif // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/media/libwebp/enc/picture_csp_enc.c b/media/libwebp/enc/picture_csp_enc.c
new file mode 100644
index 0000000000..3dd5d380e8
--- /dev/null
+++ b/media/libwebp/enc/picture_csp_enc.c
@@ -0,0 +1,1210 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture utils for colorspace conversion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/random_utils.h"
+#include "../utils/utils.h"
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
+
+// Comment out the following line to disable gamma-compression during
+// RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+#ifdef WORDS_BIGENDIAN
+// uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#define CHANNEL_OFFSET(i) (i)
+#else
+// uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#define CHANNEL_OFFSET(i) (3-(i))
+#endif
+
+#define ALPHA_OFFSET CHANNEL_OFFSET(0)
+
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+ int x_step, int y_step) {
+ if (alpha == NULL) return 0;
+ WebPInitAlphaProcessing();
+ if (x_step == 1) {
+ for (; height-- > 0; alpha += y_step) {
+ if (WebPHasAlpha8b(alpha, width)) return 1;
+ }
+ } else {
+ for (; height-- > 0; alpha += y_step) {
+ if (WebPHasAlpha32b(alpha, width)) return 1;
+ }
+ }
+ return 0;
+}
+
+// Checking for the presence of non-opaque alpha.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->use_argb) {
+ const int alpha_offset = ALPHA_OFFSET;
+ return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
+ picture->width, picture->height,
+ 4, picture->argb_stride * sizeof(*picture->argb));
+ }
+ return CheckNonOpaque(picture->a, picture->width, picture->height,
+ 1, picture->a_stride);
+}
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80 // for now we use a different gamma value than kGammaF
+#define kGammaFix 12 // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7 // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static volatile int kGammaTablesOk = 0;
+static void InitGammaTables(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTables) {
+ if (!kGammaTablesOk) {
+ int v;
+ const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+ const double norm = 1. / 255.;
+ for (v = 0; v <= 255; ++v) {
+ kGammaToLinearTab[v] =
+ (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+ }
+ kGammaTablesOk = 1;
+ }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+ return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+ const int tab_pos = v >> (kGammaTabFix + 2); // integer part
+ const int x = v & ((kGammaTabScale << 2) - 1); // fractional part
+ const int v0 = kLinearToGammaTab[tab_pos];
+ const int v1 = kLinearToGammaTab[tab_pos + 1];
+ const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate
+ assert(tab_pos + 1 < kGammaTabSize + 1);
+ return y;
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ const int y = Interpolate(base_value << shift); // final uplifted value
+ return (y + kGammaTabRounder) >> kGammaTabFix; // descale
+}
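+// Fixed-point bookkeeping for the two functions above (a reading note, not
+// upstream documentation): callers pass a sum of up to four linear values in
+// [0, kGammaScale], so base_value << shift spans kGammaFix + 2 == 14 bits.
+// Interpolate() splits that into a 5-bit table position (kGammaTabSize == 32
+// entries) and a 9-bit fractional part, and the final >> kGammaTabFix keeps
+// a factor 4, i.e. the YUV_FIX + 2 precision announced in the comment.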
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ return (int)(base_value << shift);
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+ : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+ : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+ : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but that sometimes
+// produces banding. Better to use extra precision.
+#define SFIX 2 // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t; // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
+
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// We use tables of different size and precision for the Rec709 / BT2020
+// transfer function.
+#define kGammaF (1./0.45)
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+static void InitGammaTablesS(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
+ assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
+ if (!kGammaTablesSOk) {
+ int v;
+ const double norm = 1. / MAX_Y_T;
+ const double scale = 1. / kGammaTabSize;
+ const double a = 0.09929682680944;
+ const double thresh = 0.018053968510807;
+ const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+ for (v = 0; v <= MAX_Y_T; ++v) {
+ const double g = norm * v;
+ double value;
+ if (g <= thresh * 4.5) {
+ value = g / 4.5;
+ } else {
+ const double a_rec = 1. / (1. + a);
+ value = pow(a_rec * (g + a), kGammaF);
+ }
+ kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ const double g = scale * v;
+ double value;
+ if (g <= thresh) {
+ value = 4.5 * g;
+ } else {
+ value = (1. + a) * pow(g, 1. / kGammaF) - a;
+ }
+ // we already incorporate the 1/2 rounding constant here
+ kLinearToGammaTabS[v] =
+ (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
+ }
+ // to prevent small rounding errors from causing a read overflow:
+ kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+ kGammaTablesSOk = 1;
+ }
+}
+
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+ return kGammaToLinearTabS[v];
+}
+
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+ // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+ const uint32_t v = value * kGammaTabSize;
+ const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+ // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+ const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
+ // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+ const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+ const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+ // Final interpolation. Note that rounding is already included.
+ const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
+ const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+ return result;
+}
+
+#else
+
+static void InitGammaTablesS(void) {}
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+ return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
+}
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+ return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+ return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+ const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
+ return (luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+ const uint32_t A = GammaToLinearS(a);
+ const uint32_t B = GammaToLinearS(b);
+ const uint32_t C = GammaToLinearS(c);
+ const uint32_t D = GammaToLinearS(d);
+ return LinearToGammaS((A + B + C + D + 2) >> 2);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ const uint32_t R = GammaToLinearS(src[0 * w + i]);
+ const uint32_t G = GammaToLinearS(src[1 * w + i]);
+ const uint32_t B = GammaToLinearS(src[2 * w + i]);
+ const uint32_t Y = RGBToGray(R, G, B);
+ dst[i] = (fixed_y_t)LinearToGammaS(Y);
+ }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+ fixed_t* dst, int uv_w) {
+ int i;
+ for (i = 0; i < uv_w; ++i) {
+ const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+ src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+ const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+ src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+ const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+ src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
+ const int W = RGBToGray(r, g, b);
+ dst[0 * uv_w] = (fixed_t)(r - W);
+ dst[1 * uv_w] = (fixed_t)(g - W);
+ dst[2 * uv_w] = (fixed_t)(b - W);
+ dst += 1;
+ src1 += 2;
+ src2 += 2;
+ }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+ const int v0 = (A * 3 + B + 2) >> 2;
+ return clip_y(v0 + W0);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
+ return ((fixed_y_t)a << SFIX) | SHALF;
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ int pic_width,
+ fixed_y_t* const dst) {
+ int i;
+ const int w = (pic_width + 1) & ~1;
+ for (i = 0; i < pic_width; ++i) {
+ const int off = i * step;
+ dst[i + 0 * w] = UpLift(r_ptr[off]);
+ dst[i + 1 * w] = UpLift(g_ptr[off]);
+ dst[i + 2 * w] = UpLift(b_ptr[off]);
+ }
+ if (pic_width & 1) { // replicate rightmost pixel
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+ }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+ const fixed_t* prev_uv,
+ const fixed_t* cur_uv,
+ const fixed_t* next_uv,
+ int w,
+ fixed_y_t* out1,
+ fixed_y_t* out2) {
+ const int uv_w = w >> 1;
+ const int len = (w - 1) >> 1; // length to filter
+ int k = 3;
+ while (k-- > 0) { // process each of the R/G/B segments in turn
+ // special boundary case for i==0
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+ WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+ WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+ // special boundary case for i == w - 1 when w is even
+ if (!(w & 1)) {
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+ best_y[w - 1 + 0]);
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+ best_y[w - 1 + w]);
+ }
+ out1 += w;
+ out2 += w;
+ prev_uv += uv_w;
+ cur_uv += uv_w;
+ next_uv += uv_w;
+ }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+ const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+ return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+ const int u = -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+ return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+ const int v = +28800 * r - 24116 * g - 4684 * b + SROUNDER;
+ return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+ WebPPicture* const picture) {
+ int i, j;
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ const fixed_t* const best_uv_base = best_uv;
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
+ for (i = 0; i < picture->width; ++i) {
+ const int off = (i >> 1);
+ const int W = best_y[i];
+ const int r = best_uv[off + 0 * uv_w] + W;
+ const int g = best_uv[off + 1 * uv_w] + W;
+ const int b = best_uv[off + 2 * uv_w] + W;
+ dst_y[i] = ConvertRGBToY(r, g, b);
+ }
+ best_y += w;
+ best_uv += (j & 1) * 3 * uv_w;
+ dst_y += picture->y_stride;
+ }
+ for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+ for (i = 0; i < uv_w; ++i) {
+ const int off = i;
+ const int r = best_uv[off + 0 * uv_w];
+ const int g = best_uv[off + 1 * uv_w];
+ const int b = best_uv[off + 2 * uv_w];
+ dst_u[i] = ConvertRGBToU(r, g, b);
+ dst_v[i] = ConvertRGBToV(r, g, b);
+ }
+ best_uv += 3 * uv_w;
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+
+static int PreprocessARGB(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
+ int step, int rgb_stride,
+ WebPPicture* const picture) {
+ // we expand the right/bottom border if needed
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ uint64_t prev_diff_y_sum = ~0;
+ int j, iter;
+
+ // TODO(skal): allocate one big memory chunk. But for now, it's easier
+ // for valgrind debugging to have several chunks.
+ fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
+ fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+ fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ fixed_y_t* best_y = best_y_base;
+ fixed_y_t* target_y = target_y_base;
+ fixed_t* best_uv = best_uv_base;
+ fixed_t* target_uv = target_uv_base;
+ const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+ int ok;
+
+ if (best_y_base == NULL || best_uv_base == NULL ||
+ target_y_base == NULL || target_uv_base == NULL ||
+ best_rgb_y == NULL || best_rgb_uv == NULL ||
+ tmp_buffer == NULL) {
+ ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ goto End;
+ }
+ assert(picture->width >= kMinDimensionIterativeConversion);
+ assert(picture->height >= kMinDimensionIterativeConversion);
+
+ WebPInitConvertARGBToYUV();
+
+ // Import RGB samples to W/RGB representation.
+ for (j = 0; j < picture->height; j += 2) {
+ const int is_last_row = (j == picture->height - 1);
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+ // prepare two rows of input
+ ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
+ if (!is_last_row) {
+ ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+ step, picture->width, src2);
+ } else {
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
+ }
+ StoreGray(src1, best_y + 0, w);
+ StoreGray(src2, best_y + w, w);
+
+ UpdateW(src1, target_y, w);
+ UpdateW(src2, target_y + w, w);
+ UpdateChroma(src1, src2, target_uv, uv_w);
+ memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ r_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ }
+
+ // Iterate and resolve clipping conflicts.
+ for (iter = 0; iter < kNumIterations; ++iter) {
+ const fixed_t* cur_uv = best_uv_base;
+ const fixed_t* prev_uv = best_uv_base;
+ uint64_t diff_y_sum = 0;
+
+ best_y = best_y_base;
+ best_uv = best_uv_base;
+ target_y = target_y_base;
+ target_uv = target_uv_base;
+ for (j = 0; j < h; j += 2) {
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+ {
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+ InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
+ prev_uv = cur_uv;
+ cur_uv = next_uv;
+ }
+
+ UpdateW(src1, best_rgb_y + 0 * w, w);
+ UpdateW(src2, best_rgb_y + 1 * w, w);
+ UpdateChroma(src1, src2, best_rgb_uv, uv_w);
+
+ // update two rows of Y and one row of RGB
+ diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+ WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ }
+ // test exit condition
+ if (iter > 0) {
+ if (diff_y_sum < diff_y_threshold) break;
+ if (diff_y_sum > prev_diff_y_sum) break;
+ }
+ prev_diff_y_sum = diff_y_sum;
+ }
+ // final reconstruction
+ ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
+
+ End:
+ WebPSafeFree(best_y_base);
+ WebPSafeFree(best_uv_base);
+ WebPSafeFree(target_y_base);
+ WebPSafeFree(target_uv_base);
+ WebPSafeFree(best_rgb_y);
+ WebPSafeFree(best_rgb_uv);
+ WebPSafeFree(tmp_buffer);
+ return ok;
+}
+#undef SAFE_ALLOC
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma( \
+ GammaToLinear((ptr)[0]) + \
+ GammaToLinear((ptr)[(step)]) + \
+ GammaToLinear((ptr)[rgb_stride]) + \
+ GammaToLinear((ptr)[rgb_stride + (step)]), 0) \
+
+#define SUM2(ptr) \
+ LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// The following table is (1 << kAlphaFix) / a. The formula
+// (v * kInvAlpha[a]) >> kAlphaFix is then equal to v / a in most (99.6%)
+// cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
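+// For instance (illustrative): kInvAlpha[4] == 131072 == (1 << 19) / 4, so
+// (v * kInvAlpha[4]) >> kAlphaFix computes v / 4 exactly for the ranges
+// described above; DIVIDE_BY_ALPHA below shifts by (kAlphaFix - 2) instead,
+// folding in the factor 4 that LinearToGamma() expects.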
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+ 0, /* alpha = 0 */
+ 524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+ 58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+ 30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+ 20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+ 15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+ 12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+ 10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+ 9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+ 8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+ 7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+ 6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+ 5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+ 5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+ 4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+ 4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+ 4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+ 4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+ 3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+ 3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+ 3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+ 3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+ 3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+ 2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+ 2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+ 2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+ 2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+ 2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+ 2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+ 2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+ 2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+ 2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+ 2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+ 2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+ 1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+ 1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+ 1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+ 1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+ 1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+ 1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+ 1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+ 1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+ 1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+ 1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+ 1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+ 1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+ 1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+ 1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+ 1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+ 1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+ 1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+ 1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+ 1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+ 1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+ 1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+ 1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+ 1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+ 1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+ 1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+ 1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+ 1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+ 1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+ 1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+ 1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+ 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+ 1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+ 1006, 1004, 1002, 1000, 998, 996, 994, 992,
+ 991, 989, 987, 985, 983, 981, 979, 978,
+ 976, 974, 972, 970, 969, 967, 965, 963,
+ 961, 960, 958, 956, 954, 953, 951, 949,
+ 948, 946, 944, 942, 941, 939, 937, 936,
+ 934, 932, 931, 929, 927, 926, 924, 923,
+ 921, 919, 918, 916, 914, 913, 911, 910,
+ 908, 907, 905, 903, 902, 900, 899, 897,
+ 896, 894, 893, 891, 890, 888, 887, 885,
+ 884, 882, 881, 879, 878, 876, 875, 873,
+ 872, 870, 869, 868, 866, 865, 863, 862,
+ 860, 859, 858, 856, 855, 853, 852, 851,
+ 849, 848, 846, 845, 844, 842, 841, 840,
+ 838, 837, 836, 834, 833, 832, 830, 829,
+ 828, 826, 825, 824, 823, 821, 820, 819,
+ 817, 816, 815, 814, 812, 811, 810, 809,
+ 807, 806, 805, 804, 802, 801, 800, 799,
+ 798, 796, 795, 794, 793, 791, 790, 789,
+ 788, 787, 786, 784, 783, 782, 781, 780,
+ 779, 777, 776, 775, 774, 773, 772, 771,
+ 769, 768, 767, 766, 765, 764, 763, 762,
+ 760, 759, 758, 757, 756, 755, 754, 753,
+ 752, 751, 750, 748, 747, 746, 745, 744,
+ 743, 742, 741, 740, 739, 738, 737, 736,
+ 735, 734, 733, 732, 731, 730, 729, 728,
+ 727, 726, 725, 724, 723, 722, 721, 720,
+ 719, 718, 717, 716, 715, 714, 713, 712,
+ 711, 710, 709, 708, 707, 706, 705, 704,
+ 703, 702, 701, 700, 699, 699, 698, 697,
+ 696, 695, 694, 693, 692, 691, 690, 689,
+ 688, 688, 687, 686, 685, 684, 683, 682,
+ 681, 680, 680, 679, 678, 677, 676, 675,
+ 674, 673, 673, 672, 671, 670, 669, 668,
+ 667, 667, 666, 665, 664, 663, 662, 661,
+ 661, 660, 659, 658, 657, 657, 656, 655,
+ 654, 653, 652, 652, 651, 650, 649, 648,
+ 648, 647, 646, 645, 644, 644, 643, 642,
+ 641, 640, 640, 639, 638, 637, 637, 636,
+ 635, 634, 633, 633, 632, 631, 630, 630,
+ 629, 628, 627, 627, 626, 625, 624, 624,
+ 623, 622, 621, 621, 620, 619, 618, 618,
+ 617, 616, 616, 615, 614, 613, 613, 612,
+ 611, 611, 610, 609, 608, 608, 607, 606,
+ 606, 605, 604, 604, 603, 602, 601, 601,
+ 600, 599, 599, 598, 597, 597, 596, 595,
+ 595, 594, 593, 593, 592, 591, 591, 590,
+ 589, 589, 588, 587, 587, 586, 585, 585,
+ 584, 583, 583, 582, 581, 581, 580, 579,
+ 579, 578, 578, 577, 576, 576, 575, 574,
+ 574, 573, 572, 572, 571, 571, 570, 569,
+ 569, 568, 568, 567, 566, 566, 565, 564,
+ 564, 563, 563, 562, 561, 561, 560, 560,
+ 559, 558, 558, 557, 557, 556, 555, 555,
+ 554, 554, 553, 553, 552, 551, 551, 550,
+ 550, 549, 548, 548, 547, 547, 546, 546,
+ 545, 544, 544, 543, 543, 542, 542, 541,
+ 541, 540, 539, 539, 538, 538, 537, 537,
+ 536, 536, 535, 534, 534, 533, 533, 532,
+ 532, 531, 531, 530, 530, 529, 529, 528,
+ 527, 527, 526, 526, 525, 525, 524, 524,
+ 523, 523, 522, 522, 521, 521, 520, 520,
+ 519, 519, 518, 518, 517, 517, 516, 516,
+ 515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor of 4 directly inside the DIVIDE_BY_ALPHA macro.
+#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif // USE_INVERSE_ALPHA_TABLE
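+// Both definitions compute the same quantity: shifting by (kAlphaFix - 2)
+// rather than kAlphaFix multiplies by 4, so in either case
+// DIVIDE_BY_ALPHA(sum, a) is (up to the table's rounding) (4 * sum) / a.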
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+ const uint8_t* a_ptr,
+ uint32_t total_a, int step,
+ int rgb_stride) {
+ const uint32_t sum =
+ a_ptr[0] * GammaToLinear(src[0]) +
+ a_ptr[step] * GammaToLinear(src[step]) +
+ a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+ a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+ assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+ return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ uint8_t* const dst_y,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < width; i += 1, j += step) {
+ dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+ }
+}
+
+static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ const uint8_t* const a_ptr,
+ int rgb_stride,
+ uint16_t* dst, int width) {
+ int i, j;
+ // We loop over 2x2 blocks and produce one R/G/B/A value for each.
+ for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
+ const uint32_t a = SUM4ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM4(r_ptr + j, 4);
+ g = SUM4(g_ptr + j, 4);
+ b = SUM4(b_ptr + j, 4);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ }
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ }
+ if (width & 1) {
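+ // For an odd width, the last column forms only a 2x1 block; doubling the
+ // two-pixel alpha sum keeps 'a' on the same 4-pixel scale as above.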
+ const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM2(r_ptr + j);
+ g = SUM2(g_ptr + j);
+ b = SUM2(b_ptr + j);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ }
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ }
+}
+
+static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ uint16_t* dst, int width) {
+ int i, j;
+ for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
+ dst[0] = SUM4(r_ptr + j, step);
+ dst[1] = SUM4(g_ptr + j, step);
+ dst[2] = SUM4(b_ptr + j, step);
+ }
+ if (width & 1) {
+ dst[0] = SUM2(r_ptr + j);
+ dst[1] = SUM2(g_ptr + j);
+ dst[2] = SUM2(b_ptr + j);
+ }
+}
+
+static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i;
+ for (i = 0; i < width; i += 1, rgb += 4) {
+ const int r = rgb[0], g = rgb[1], b = rgb[2];
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+}
+
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
+ const uint8_t* a_ptr,
+ int step, // bytes per pixel
+ int rgb_stride, // bytes per scanline
+ float dithering,
+ int use_iterative_conversion,
+ WebPPicture* const picture) {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+ const int is_rgb = (r_ptr < b_ptr); // otherwise it's bgr
+
+ picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+ picture->use_argb = 0;
+
+ // Disable sharp (iterative) conversion if the source is too small; it
+ // would be overkill.
+ if (width < kMinDimensionIterativeConversion ||
+ height < kMinDimensionIterativeConversion) {
+ use_iterative_conversion = 0;
+ }
+
+ if (!WebPPictureAllocYUVA(picture, width, height)) {
+ return 0;
+ }
+ if (has_alpha) {
+ assert(step == 4);
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
+ assert(kAlphaFix + kGammaFix <= 31);
+#endif
+ }
+
+ if (use_iterative_conversion) {
+ InitGammaTablesS();
+ if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+ return 0;
+ }
+ if (has_alpha) {
+ WebPExtractAlpha(a_ptr, rgb_stride, width, height,
+ picture->a, picture->a_stride);
+ }
+ } else {
+ const int uv_width = (width + 1) >> 1;
+ int use_dsp = (step == 3); // use special function in this case
+ // temporary storage for accumulated R/G/B values during conversion to U/V
+ uint16_t* const tmp_rgb =
+ (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ uint8_t* dst_a = picture->a;
+
+ VP8Random base_rg;
+ VP8Random* rg = NULL;
+ if (dithering > 0.) {
+ VP8InitRandom(&base_rg, dithering);
+ rg = &base_rg;
+ use_dsp = 0; // can't use dsp in this case
+ }
+ WebPInitConvertARGBToYUV();
+ InitGammaTables();
+
+ if (tmp_rgb == NULL) return 0; // malloc error
+
+ // Downsample Y/U/V planes, two rows at a time
+ for (y = 0; y < (height >> 1); ++y) {
+ int rows_have_alpha = has_alpha;
+ if (use_dsp) {
+ if (is_rgb) {
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
+ WebPConvertRGB24ToY(r_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
+ } else {
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
+ WebPConvertBGR24ToY(b_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
+ }
+ } else {
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+ ConvertRowToY(r_ptr + rgb_stride,
+ g_ptr + rgb_stride,
+ b_ptr + rgb_stride, step,
+ dst_y + picture->y_stride, width, rg);
+ }
+ dst_y += 2 * picture->y_stride;
+ if (has_alpha) {
+ rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
+ dst_a, picture->a_stride);
+ dst_a += 2 * picture->a_stride;
+ }
+ // Collect averaged R/G/B(/A)
+ if (!rows_have_alpha) {
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
+ } else {
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
+ }
+ // Convert to U/V
+ if (rg == NULL) {
+ WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+ } else {
+ ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+ }
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ r_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ if (has_alpha) a_ptr += 2 * rgb_stride;
+ }
+ if (height & 1) { // extra last row
+ int row_has_alpha = has_alpha;
+ if (use_dsp) {
+ if (is_rgb) {
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
+ } else {
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
+ }
+ } else {
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+ }
+ if (row_has_alpha) {
+ row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
+ }
+ // Collect averaged R/G/B(/A)
+ if (!row_has_alpha) {
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+ tmp_rgb, width);
+ } else {
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+ tmp_rgb, width);
+ }
+ if (rg == NULL) {
+ WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+ } else {
+ ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+ }
+ }
+ WebPSafeFree(tmp_rgb);
+ }
+ return 1;
+}
+
+#undef SUM4
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
+
+//------------------------------------------------------------------------------
+// call for ARGB->YUVA conversion
+
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering, int use_iterative_conversion) {
+ if (picture == NULL) return 0;
+ if (picture->argb == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ } else {
+ const uint8_t* const argb = (const uint8_t*)picture->argb;
+ const uint8_t* const a = argb + CHANNEL_OFFSET(0);
+ const uint8_t* const r = argb + CHANNEL_OFFSET(1);
+ const uint8_t* const g = argb + CHANNEL_OFFSET(2);
+ const uint8_t* const b = argb + CHANNEL_OFFSET(3);
+
+ picture->colorspace = WEBP_YUV420;
+ return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
+ dithering, use_iterative_conversion, picture);
+ }
+}
+
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering) {
+ return PictureARGBToYUVA(picture, colorspace, dithering, 0);
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+ return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
+}
+
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
+ return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
+}
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+ return WebPPictureSharpARGBToYUVA(picture);
+}
+
+//------------------------------------------------------------------------------
+// call for YUVA -> ARGB conversion
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+ // Allocate a new argb buffer (discarding the previous one).
+ if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+ picture->use_argb = 1;
+
+ // Convert
+ {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int argb_stride = 4 * picture->argb_stride;
+ uint8_t* dst = (uint8_t*)picture->argb;
+ const uint8_t* cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+ WebPUpsampleLinePairFunc upsample =
+ WebPGetLinePairConverter(ALPHA_OFFSET > 0);
+
+ // First row, with replicated top samples.
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ cur_y += picture->y_stride;
+ dst += argb_stride;
+ // Center rows.
+ for (y = 1; y + 1 < height; y += 2) {
+ const uint8_t* const top_u = cur_u;
+ const uint8_t* const top_v = cur_v;
+ cur_u += picture->uv_stride;
+ cur_v += picture->uv_stride;
+ upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+ dst, dst + argb_stride, width);
+ cur_y += 2 * picture->y_stride;
+ dst += 2 * argb_stride;
+ }
+ // Last row (if needed), with replicated bottom samples.
+ if (height > 1 && !(height & 1)) {
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ }
+ // Insert alpha values if needed, replacing the default 0xff ones.
+ if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+ for (y = 0; y < height; ++y) {
+ uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+ const uint8_t* const src = picture->a + y * picture->a_stride;
+ int x;
+ for (x = 0; x < width; ++x) {
+ argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// automatic import / conversion
+
+static int Import(WebPPicture* const picture,
+ const uint8_t* rgb, int rgb_stride,
+ int step, int swap_rb, int import_alpha) {
+ int y;
+ // swap_rb -> b,g,r,a ; !swap_rb -> r,g,b,a
+ const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+ const uint8_t* g_ptr = rgb + 1;
+ const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+ const int width = picture->width;
+ const int height = picture->height;
+
+ if (!picture->use_argb) {
+ const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
+ return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+ 0.f /* no dithering */, 0, picture);
+ }
+ if (!WebPPictureAlloc(picture)) return 0;
+
+ VP8LDspInit();
+ WebPInitAlphaProcessing();
+
+ if (import_alpha) {
+ // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little-endian
+ uint32_t* dst = picture->argb;
+ const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
+ assert(step == 4);
+ if (do_copy) {
+ for (y = 0; y < height; ++y) {
+ memcpy(dst, rgb, width * 4);
+ rgb += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ } else {
+ for (y = 0; y < height; ++y) {
+#ifdef WORDS_BIGENDIAN
+ // BGRA or RGBA input order.
+ const uint8_t* a_ptr = rgb + 3;
+ WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+#else
+ // RGBA input order. Need to swap R and B.
+ VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+#endif
+ rgb += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ }
+ } else {
+ uint32_t* dst = picture->argb;
+ assert(step >= 3);
+ for (y = 0; y < height; ++y) {
+ WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ }
+ return 1;
+}
+
+// Public API
+
+#if !defined(WEBP_REDUCE_CSP)
+
+int WebPPictureImportBGR(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL && rgb != NULL)
+ ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+ : 0;
+}
+
+int WebPPictureImportBGRA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+ : 0;
+}
+
+int WebPPictureImportBGRX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+ : 0;
+}
+
+#endif // WEBP_REDUCE_CSP
+
+int WebPPictureImportRGB(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL && rgb != NULL)
+ ? Import(picture, rgb, rgb_stride, 3, 0, 0)
+ : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 0, 1)
+ : 0;
+}
+
+int WebPPictureImportRGBX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+ : 0;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_enc.c b/media/libwebp/enc/picture_enc.c
new file mode 100644
index 0000000000..5275ba9ed2
--- /dev/null
+++ b/media/libwebp/enc/picture_enc.c
@@ -0,0 +1,296 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture class basis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// WebPPicture
+//------------------------------------------------------------------------------
+
+static int DummyWriter(const uint8_t* data, size_t data_size,
+ const WebPPicture* const picture) {
+ // The following are to prevent 'unused variable' error messages.
+ (void)data;
+ (void)data_size;
+ (void)picture;
+ return 1;
+}
+
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+ if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+ return 0; // caller/system version mismatch!
+ }
+ if (picture != NULL) {
+ memset(picture, 0, sizeof(*picture));
+ picture->writer = DummyWriter;
+ WebPEncodingSetError(picture, VP8_ENC_OK);
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
+ picture->memory_argb_ = NULL;
+ picture->argb = NULL;
+ picture->argb_stride = 0;
+}
+
+static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
+ picture->memory_ = NULL;
+ picture->y = picture->u = picture->v = picture->a = NULL;
+ picture->y_stride = picture->uv_stride = 0;
+ picture->a_stride = 0;
+}
+
+void WebPPictureResetBuffers(WebPPicture* const picture) {
+ WebPPictureResetBufferARGB(picture);
+ WebPPictureResetBufferYUVA(picture);
+}
+
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+ void* memory;
+ const uint64_t argb_size = (uint64_t)width * height;
+
+ assert(picture != NULL);
+
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBufferARGB(picture);
+
+ if (width <= 0 || height <= 0) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ // allocate a new buffer.
+ memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
+ if (memory == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ picture->memory_argb_ = memory;
+ picture->argb = (uint32_t*)WEBP_ALIGN(memory);
+ picture->argb_stride = width;
+ return 1;
+}
+
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
+ const WebPEncCSP uv_csp =
+ (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+ const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+ const int y_stride = width;
+ const int uv_width = (int)(((int64_t)width + 1) >> 1);
+ const int uv_height = (int)(((int64_t)height + 1) >> 1);
+ const int uv_stride = uv_width;
+ int a_width, a_stride;
+ uint64_t y_size, uv_size, a_size, total_size;
+ uint8_t* mem;
+
+ assert(picture != NULL);
+
+ WebPSafeFree(picture->memory_);
+ WebPPictureResetBufferYUVA(picture);
+
+ if (uv_csp != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+
+ // alpha
+ a_width = has_alpha ? width : 0;
+ a_stride = a_width;
+ y_size = (uint64_t)y_stride * height;
+ uv_size = (uint64_t)uv_stride * uv_height;
+ a_size = (uint64_t)a_stride * height;
+
+ total_size = y_size + a_size + 2 * uv_size;
+
+ // Security and validation checks
+ if (width <= 0 || height <= 0 || // luma/alpha param error
+ uv_width <= 0 || uv_height <= 0) { // u/v param error
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ // allocate a new buffer.
+ mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+ if (mem == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+
+ // From now on, we're in the clear; nothing below can fail.
+ picture->memory_ = (void*)mem;
+ picture->y_stride = y_stride;
+ picture->uv_stride = uv_stride;
+ picture->a_stride = a_stride;
+
+ // TODO(skal): we could align the y/u/v planes and adjust stride.
+ picture->y = mem;
+ mem += y_size;
+
+ picture->u = mem;
+ mem += uv_size;
+ picture->v = mem;
+ mem += uv_size;
+
+ if (a_size > 0) {
+ picture->a = mem;
+ mem += a_size;
+ }
+ (void)mem; // makes the static analyzer happy
+ return 1;
+}
+
+int WebPPictureAlloc(WebPPicture* picture) {
+ if (picture != NULL) {
+ const int width = picture->width;
+ const int height = picture->height;
+
+ WebPPictureFree(picture); // erase previous buffer
+
+ if (!picture->use_argb) {
+ return WebPPictureAllocYUVA(picture, width, height);
+ } else {
+ return WebPPictureAllocARGB(picture, width, height);
+ }
+ }
+ return 1;
+}
+
+void WebPPictureFree(WebPPicture* picture) {
+ if (picture != NULL) {
+ WebPSafeFree(picture->memory_);
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBuffers(picture);
+ }
+}
+
+//------------------------------------------------------------------------------
+// WebPMemoryWriter: Write-to-memory
+
+void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
+ writer->mem = NULL;
+ writer->size = 0;
+ writer->max_size = 0;
+}
+
+int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+ const WebPPicture* picture) {
+ WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
+ uint64_t next_size;
+ if (w == NULL) {
+ return 1;
+ }
+ next_size = (uint64_t)w->size + data_size;
+ if (next_size > w->max_size) {
+ uint8_t* new_mem;
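+ // Grow geometrically (doubling, with an 8 KiB floor) so that repeated
+ // small writes stay amortized O(1) in copying cost.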
+ uint64_t next_max_size = 2ULL * w->max_size;
+ if (next_max_size < next_size) next_max_size = next_size;
+ if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+ new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
+ if (new_mem == NULL) {
+ return 0;
+ }
+ if (w->size > 0) {
+ memcpy(new_mem, w->mem, w->size);
+ }
+ WebPSafeFree(w->mem);
+ w->mem = new_mem;
+ // down-cast is ok, thanks to the overflow check in WebPSafeMalloc
+ w->max_size = (size_t)next_max_size;
+ }
+ if (data_size > 0) {
+ memcpy(w->mem + w->size, data, data_size);
+ w->size += data_size;
+ }
+ return 1;
+}
+
+void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
+ if (writer != NULL) {
+ WebPSafeFree(writer->mem);
+ writer->mem = NULL;
+ writer->size = 0;
+ writer->max_size = 0;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Simplest high-level calls:
+
+typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
+
+static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
+ Importer import, float quality_factor, int lossless,
+ uint8_t** output) {
+ WebPPicture pic;
+ WebPConfig config;
+ WebPMemoryWriter wrt;
+ int ok;
+
+ if (output == NULL) return 0;
+
+ if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, quality_factor) ||
+ !WebPPictureInit(&pic)) {
+ return 0; // shouldn't happen, unless the system installation is broken
+ }
+
+ config.lossless = !!lossless;
+ pic.use_argb = !!lossless;
+ pic.width = width;
+ pic.height = height;
+ pic.writer = WebPMemoryWrite;
+ pic.custom_ptr = &wrt;
+ WebPMemoryWriterInit(&wrt);
+
+ ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
+ WebPPictureFree(&pic);
+ if (!ok) {
+ WebPMemoryWriterClear(&wrt);
+ *output = NULL;
+ return 0;
+ }
+ *output = wrt.mem;
+ return wrt.size;
+}
+
+#define ENCODE_FUNC(NAME, IMPORTER) \
+size_t NAME(const uint8_t* in, int w, int h, int bps, float q, \
+ uint8_t** out) { \
+ return Encode(in, w, h, bps, IMPORTER, q, 0, out); \
+}
+
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+#endif // WEBP_REDUCE_CSP
+
+#undef ENCODE_FUNC
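+// Illustrative usage (a sketch, not part of this file): encode a tightly
+// packed RGBA buffer at quality 80, then release the output:
+//   uint8_t* out = NULL;
+//   const size_t size = WebPEncodeRGBA(rgba, width, height, width * 4,
+//                                      80.f, &out);
+//   if (size > 0) { /* consume out[0 .. size-1] */ }
+//   WebPFree(out);  // or the matching deallocator for this build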
+
+#define LOSSLESS_DEFAULT_QUALITY 70.
+#define LOSSLESS_ENCODE_FUNC(NAME, IMPORTER) \
+size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) { \
+ return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out); \
+}
+
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+#endif // WEBP_REDUCE_CSP
+
+#undef LOSSLESS_ENCODE_FUNC
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_psnr_enc.c b/media/libwebp/enc/picture_psnr_enc.c
new file mode 100644
index 0000000000..bbd32854c9
--- /dev/null
+++ b/media/libwebp/enc/picture_psnr_enc.c
@@ -0,0 +1,258 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2 // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int x, y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+ const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+ for (x = 0; x < w; ++x) {
+ const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+ const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+ double best_sse = 255. * 255.;
+ const double value = (double)ref[y * ref_stride + x];
+ int i, j;
+ for (j = y_0; j < y_1; ++j) {
+ const uint8_t* const s = src + j * src_stride;
+ for (i = x_0; i < x_1; ++i) {
+ const double diff = s[i] - value;
+ const double sse = diff * diff;
+ if (sse < best_sse) best_sse = sse;
+ }
+ }
+ total_sse += best_sse;
+ }
+ }
+ return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ total_sse += VP8AccumulateSSE(src, ref, w);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+ const int w1 = w - VP8_SSIM_KERNEL - 1;
+ const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+ const int h1 = h - VP8_SSIM_KERNEL - 1;
+ int x, y;
+ double sum = 0.;
+ for (y = 0; y < h0; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h1; ++y) {
+ for (x = 0; x < w0; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ for (; x < w1; ++x) {
+ const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+ const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+ sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+ }
+ for (; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+ return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+ : kMinDistortion_dB;
+}
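+// Note: -4.3429448 is -10 / ln(10), so GetPSNR(v, size) equals
+// 10 * log10(255 * 255 * size / v), i.e. the usual PSNR in dB with mean
+// squared error v / size.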
+
+static double GetLogSSIM(double v, double size) {
+ v = (size > 0.) ? v / size : 1.;
+ return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height, size_t x_step,
+ int type, float* distortion, float* result) {
+ uint8_t* allocated = NULL;
+ const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+ (type == 1) ? AccumulateSSIM :
+ AccumulateLSIM;
+ if (src == NULL || ref == NULL ||
+ src_stride < x_step * width || ref_stride < x_step * width ||
+ result == NULL || distortion == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (x_step != 1) { // extract a packed plane if needed
+ int x, y;
+ uint8_t* tmp1;
+ uint8_t* tmp2;
+ allocated =
+ (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+ if (allocated == NULL) return 0;
+ tmp1 = allocated;
+ tmp2 = tmp1 + (size_t)width * height;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ tmp1[x + y * width] = src[x * x_step + y * src_stride];
+ tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+ }
+ }
+ src = tmp1;
+ ref = tmp2;
+ }
+ *distortion = (float)metric(src, width, ref, width, width, height);
+ WebPSafeFree(allocated);
+
+ *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+ : (float)GetPSNR(*distortion, (double)width * height);
+ return 1;
+}
+
+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float results[5]) {
+ int w, h, c;
+ int ok = 0;
+ WebPPicture p0, p1;
+ double total_size = 0., total_distortion = 0.;
+ if (src == NULL || ref == NULL ||
+ src->width != ref->width || src->height != ref->height ||
+ results == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+ w = src->width;
+ h = src->height;
+ if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+ if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+ // We always measure distortion in ARGB space.
+ if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+ if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+ for (c = 0; c < 4; ++c) {
+ float distortion;
+ const size_t stride0 = 4 * (size_t)p0.argb_stride;
+ const size_t stride1 = 4 * (size_t)p1.argb_stride;
+ // results are reported as BGRA
+ const int offset = c ^ BLUE_OFFSET;
+ if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+ (const uint8_t*)p1.argb + offset, stride1,
+ w, h, 4, type, &distortion, results + c)) {
+ goto Error;
+ }
+ total_distortion += distortion;
+ total_size += w * h;
+ }
+
+ results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+ : (float)GetPSNR(total_distortion, total_size);
+ ok = 1;
+
+ Error:
+ WebPPictureFree(&p0);
+ WebPPictureFree(&p1);
+ return ok;
+}
+
+#undef BLUE_OFFSET
+
+#else // defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE)
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height, size_t x_step,
+ int type, float* distortion, float* result) {
+ (void)src;
+ (void)src_stride;
+ (void)ref;
+ (void)ref_stride;
+ (void)width;
+ (void)height;
+ (void)x_step;
+ (void)type;
+ if (distortion == NULL || result == NULL) return 0;
+ *distortion = 0.f;
+ *result = 0.f;
+ return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float results[5]) {
+ int i;
+ (void)src;
+ (void)ref;
+ (void)type;
+ if (results == NULL) return 0;
+ for (i = 0; i < 5; ++i) results[i] = 0.f;
+ return 1;
+}
+
+#endif // !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
diff --git a/media/libwebp/enc/picture_rescale_enc.c b/media/libwebp/enc/picture_rescale_enc.c
new file mode 100644
index 0000000000..22d31363f0
--- /dev/null
+++ b/media/libwebp/enc/picture_rescale_enc.c
@@ -0,0 +1,316 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: copy, crop, rescaling and view.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
+
+#define HALVE(x) (((x) + 1) >> 1)
+
+// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
+// into 'dst'. Mark 'dst' as not owning any memory.
+static void PictureGrabSpecs(const WebPPicture* const src,
+ WebPPicture* const dst) {
+ assert(src != NULL && dst != NULL);
+ *dst = *src;
+ WebPPictureResetBuffers(dst);
+}
+
+//------------------------------------------------------------------------------
+
+// Adjust top-left corner to chroma sample position.
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+ int* const left, int* const top) {
+ if (!pic->use_argb) {
+ *left &= ~1;
+ *top &= ~1;
+ }
+}
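+// E.g. in YUV420 mode a requested corner (3, 5) snaps to (2, 4), so that the
+// crop or view starts on a chroma sample.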
+
+// Adjust top-left corner and verify that the sub-rectangle is valid.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+ int* const left, int* const top,
+ int width, int height) {
+ SnapTopLeftPosition(pic, left, top);
+ if ((*left) < 0 || (*top) < 0) return 0;
+ if (width <= 0 || height <= 0) return 0;
+ if ((*left) + width > pic->width) return 0;
+ if ((*top) + height > pic->height) return 0;
+ return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+ if (src == dst) return 1;
+
+ PictureGrabSpecs(src, dst);
+ if (!WebPPictureAlloc(dst)) return 0;
+
+ if (!src->use_argb) {
+ WebPCopyPlane(src->y, src->y_stride,
+ dst->y, dst->y_stride, dst->width, dst->height);
+ WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
+ HALVE(dst->width), HALVE(dst->height));
+ WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
+ HALVE(dst->width), HALVE(dst->height));
+ if (dst->a != NULL) {
+ WebPCopyPlane(src->a, src->a_stride,
+ dst->a, dst->a_stride, dst->width, dst->height);
+ }
+ } else {
+ WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+ (uint8_t*)dst->argb, 4 * dst->argb_stride,
+ 4 * dst->width, dst->height);
+ }
+ return 1;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->use_argb) {
+ return (picture->memory_argb_ == NULL);
+ }
+ return (picture->memory_ == NULL);
+}
+
+int WebPPictureView(const WebPPicture* src,
+ int left, int top, int width, int height,
+ WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+
+ // verify rectangle position.
+ if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+ if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
+ PictureGrabSpecs(src, dst);
+ }
+ dst->width = width;
+ dst->height = height;
+ if (!src->use_argb) {
+ dst->y = src->y + top * src->y_stride + left;
+ dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->y_stride = src->y_stride;
+ dst->uv_stride = src->uv_stride;
+ if (src->a != NULL) {
+ dst->a = src->a + top * src->a_stride + left;
+ dst->a_stride = src->a_stride;
+ }
+ } else {
+ dst->argb = src->argb + top * src->argb_stride + left;
+ dst->argb_stride = src->argb_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Picture cropping
+
+int WebPPictureCrop(WebPPicture* pic,
+ int left, int top, int width, int height) {
+ WebPPicture tmp;
+
+ if (pic == NULL) return 0;
+ if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ const int y_offset = top * pic->y_stride + left;
+ const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+ WebPCopyPlane(pic->y + y_offset, pic->y_stride,
+ tmp.y, tmp.y_stride, width, height);
+ WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
+ tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+ WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
+ tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+
+ if (tmp.a != NULL) {
+ const int a_offset = top * pic->a_stride + left;
+ WebPCopyPlane(pic->a + a_offset, pic->a_stride,
+ tmp.a, tmp.a_stride, width, height);
+ }
+ } else {
+ const uint8_t* const src =
+ (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+ WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
+ tmp.argb_stride * 4, width * 4, height);
+ }
+ WebPPictureFree(pic);
+ *pic = tmp;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+static int RescalePlane(const uint8_t* src,
+ int src_width, int src_height, int src_stride,
+ uint8_t* dst,
+ int dst_width, int dst_height, int dst_stride,
+ rescaler_t* const work,
+ int num_channels) {
+ WebPRescaler rescaler;
+ int y = 0;
+ if (!WebPRescalerInit(&rescaler, src_width, src_height,
+ dst, dst_width, dst_height, dst_stride,
+ num_channels, work)) {
+ return 0;
+ }
+ while (y < src_height) {
+ y += WebPRescalerImport(&rescaler, src_height - y,
+ src + y * src_stride, src_stride);
+ WebPRescalerExport(&rescaler);
+ }
+ return 1;
+}
+
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+ assert(pic->argb != NULL);
+ WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
+ pic->width, pic->height, inverse);
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+ if (pic->a != NULL) {
+ WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
+ pic->width, pic->height, inverse);
+ }
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+ WebPPicture tmp;
+ int prev_width, prev_height;
+ rescaler_t* work;
+
+ if (pic == NULL) return 0;
+ prev_width = pic->width;
+ prev_height = pic->height;
+ if (!WebPRescalerGetScaledDimensions(
+ prev_width, prev_height, &width, &height)) {
+ return 0;
+ }
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // If present, we need to rescale alpha first (for AlphaMultiplyY).
+ if (pic->a != NULL) {
+ WebPInitAlphaProcessing();
+ if (!RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+ tmp.a, width, height, tmp.a_stride, work, 1)) {
+ return 0;
+ }
+ }
+
+ // We take transparency into account on the luma plane only. That's not
+ // totally exact blending, but it is still a good approximation.
+ AlphaMultiplyY(pic, 0);
+ if (!RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+ tmp.y, width, height, tmp.y_stride, work, 1) ||
+ !RescalePlane(pic->u,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.u,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1) ||
+ !RescalePlane(pic->v,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.v,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1)) {
+ return 0;
+ }
+ AlphaMultiplyY(&tmp, 1);
+ } else {
+ work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // In order to correctly interpolate colors, we need to apply the alpha
+ // weighting first (black-matting), scale the RGB values, and remove
+ // the premultiplication afterward (while preserving the alpha channel).
+ WebPInitAlphaProcessing();
+ AlphaMultiplyARGB(pic, 0);
+ if (!RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+ pic->argb_stride * 4,
+ (uint8_t*)tmp.argb, width, height,
+ tmp.argb_stride * 4, work, 4)) {
+ return 0;
+ }
+ AlphaMultiplyARGB(&tmp, 1);
+ }
+ WebPPictureFree(pic);
+ WebPSafeFree(work);
+ *pic = tmp;
+ return 1;
+}
+
+#else // defined(WEBP_REDUCE_SIZE)
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+ (void)src;
+ (void)dst;
+ return 0;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+ (void)picture;
+ return 0;
+}
+
+int WebPPictureView(const WebPPicture* src,
+ int left, int top, int width, int height,
+ WebPPicture* dst) {
+ (void)src;
+ (void)left;
+ (void)top;
+ (void)width;
+ (void)height;
+ (void)dst;
+ return 0;
+}
+
+int WebPPictureCrop(WebPPicture* pic,
+ int left, int top, int width, int height) {
+ (void)pic;
+ (void)left;
+ (void)top;
+ (void)width;
+ (void)height;
+ return 0;
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+ (void)pic;
+ (void)width;
+ (void)height;
+ return 0;
+}
+#endif // !defined(WEBP_REDUCE_SIZE)
diff --git a/media/libwebp/enc/picture_tools_enc.c b/media/libwebp/enc/picture_tools_enc.c
new file mode 100644
index 0000000000..02d48c5223
--- /dev/null
+++ b/media/libwebp/enc/picture_tools_enc.c
@@ -0,0 +1,273 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: alpha handling, etc.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
+ int y, x;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) {
+ if (ptr[x] & 0xff000000u) {
+ return 0;
+ }
+ }
+ ptr += stride;
+ }
+ return 1;
+}
+
+static void Flatten(uint8_t* ptr, int v, int stride, int size) {
+ int y;
+ for (y = 0; y < size; ++y) {
+ memset(ptr, v, size);
+ ptr += stride;
+ }
+}
+
+static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
+ int x, y;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) ptr[x] = v;
+ ptr += stride;
+ }
+}
+
+// Smoothen the luma components of transparent pixels. Return true if the whole
+// block is transparent.
+static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
+ int y_stride, int width, int height) {
+ int sum = 0, count = 0;
+ int x, y;
+ const uint8_t* alpha_ptr = a_ptr;
+ uint8_t* luma_ptr = y_ptr;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ if (alpha_ptr[x] != 0) {
+ ++count;
+ sum += luma_ptr[x];
+ }
+ }
+ alpha_ptr += a_stride;
+ luma_ptr += y_stride;
+ }
+ if (count > 0 && count < width * height) {
+ const uint8_t avg_u8 = (uint8_t)(sum / count);
+ alpha_ptr = a_ptr;
+ luma_ptr = y_ptr;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
+ }
+ alpha_ptr += a_stride;
+ luma_ptr += y_stride;
+ }
+ }
+ return (count == 0);
+}
+
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
+ if (pic != NULL && pic->use_argb) {
+ int y = pic->height;
+ uint32_t* argb = pic->argb;
+ color &= 0xffffffu; // force alpha=0
+ WebPInitAlphaProcessing();
+ while (y-- > 0) {
+ WebPAlphaReplace(argb, pic->width, color);
+ argb += pic->argb_stride;
+ }
+ }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+ int x, y, w, h;
+ if (pic == NULL) return;
+ w = pic->width / SIZE;
+ h = pic->height / SIZE;
+
+ // Note: we ignore the left-overs on the right/bottom, except for SmoothenBlock().
+ if (pic->use_argb) {
+ uint32_t argb_value = 0;
+ for (y = 0; y < h; ++y) {
+ int need_reset = 1;
+ for (x = 0; x < w; ++x) {
+ const int off = (y * pic->argb_stride + x) * SIZE;
+ if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
+ if (need_reset) {
+ argb_value = pic->argb[off];
+ need_reset = 0;
+ }
+ FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+ } else {
+ need_reset = 1;
+ }
+ }
+ }
+ } else {
+ const int width = pic->width;
+ const int height = pic->height;
+ const int y_stride = pic->y_stride;
+ const int uv_stride = pic->uv_stride;
+ const int a_stride = pic->a_stride;
+ uint8_t* y_ptr = pic->y;
+ uint8_t* u_ptr = pic->u;
+ uint8_t* v_ptr = pic->v;
+ const uint8_t* a_ptr = pic->a;
+ int values[3] = { 0 };
+ if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
+ return;
+ }
+ for (y = 0; y + SIZE <= height; y += SIZE) {
+ int need_reset = 1;
+ for (x = 0; x + SIZE <= width; x += SIZE) {
+ if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ SIZE, SIZE)) {
+ if (need_reset) {
+ values[0] = y_ptr[x];
+ values[1] = u_ptr[x >> 1];
+ values[2] = v_ptr[x >> 1];
+ need_reset = 0;
+ }
+ Flatten(y_ptr + x, values[0], y_stride, SIZE);
+ Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
+ Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
+ } else {
+ need_reset = 1;
+ }
+ }
+ if (x < width) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ width - x, SIZE);
+ }
+ a_ptr += SIZE * a_stride;
+ y_ptr += SIZE * y_stride;
+ u_ptr += SIZE2 * uv_stride;
+ v_ptr += SIZE2 * uv_stride;
+ }
+ if (y < height) {
+ const int sub_height = height - y;
+ for (x = 0; x + SIZE <= width; x += SIZE) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ SIZE, sub_height);
+ }
+ if (x < width) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ width - x, sub_height);
+ }
+ }
+ }
+}
+
+#undef SIZE
+#undef SIZE2
+
+//------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+ ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+ ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
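+// BLEND approximates (V0 * (255 - ALPHA) + V1 * ALPHA) / 255 without a
+// division: multiplying by 0x101 and shifting right by 16 divides by 255,
+// exactly so whenever the weighted sum is a multiple of 255 (in particular
+// for ALPHA == 0 or 255). E.g. BLEND(0, 255, 128) == 128.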
+
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+ return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+ const int red = (background_rgb >> 16) & 0xff;
+ const int green = (background_rgb >> 8) & 0xff;
+ const int blue = (background_rgb >> 0) & 0xff;
+ int x, y;
+ if (pic == NULL) return;
+ if (!pic->use_argb) {
+ const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
+ const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
+ // VP8RGBToU/V expects the u/v values summed over four pixels
+ const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+ uint8_t* y_ptr = pic->y;
+ uint8_t* u_ptr = pic->u;
+ uint8_t* v_ptr = pic->v;
+ uint8_t* a_ptr = pic->a;
+ if (!has_alpha || a_ptr == NULL) return; // nothing to do
+ for (y = 0; y < pic->height; ++y) {
+ // Luma blending
+ for (x = 0; x < pic->width; ++x) {
+ const uint8_t alpha = a_ptr[x];
+ if (alpha < 0xff) {
+ y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
+ }
+ }
+ // Chroma blending every even line
+ if ((y & 1) == 0) {
+ uint8_t* const a_ptr2 =
+ (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+ for (x = 0; x < uv_width; ++x) {
+ // Average four alpha values into a single blending weight.
+ // TODO(skal): might lead to visible contouring. Can we do better?
+ const uint32_t alpha =
+ a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+ a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+ u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+ v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+ }
+ if (pic->width & 1) { // rightmost pixel
+ const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+ u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+ v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+ }
+ } else {
+ u_ptr += pic->uv_stride;
+ v_ptr += pic->uv_stride;
+ }
+ memset(a_ptr, 0xff, pic->width); // reset alpha value to opaque
+ a_ptr += pic->a_stride;
+ y_ptr += pic->y_stride;
+ }
+ } else {
+ uint32_t* argb = pic->argb;
+ const uint32_t background = MakeARGB32(red, green, blue);
+ for (y = 0; y < pic->height; ++y) {
+ for (x = 0; x < pic->width; ++x) {
+ const int alpha = (argb[x] >> 24) & 0xff;
+ if (alpha != 0xff) {
+ if (alpha > 0) {
+ int r = (argb[x] >> 16) & 0xff;
+ int g = (argb[x] >> 8) & 0xff;
+ int b = (argb[x] >> 0) & 0xff;
+ r = BLEND(red, r, alpha);
+ g = BLEND(green, g, alpha);
+ b = BLEND(blue, b, alpha);
+ argb[x] = MakeARGB32(r, g, b);
+ } else {
+ argb[x] = background;
+ }
+ }
+ }
+ argb += pic->argb_stride;
+ }
+ }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/predictor_enc.c b/media/libwebp/enc/predictor_enc.c
new file mode 100644
index 0000000000..794c45cde6
--- /dev/null
+++ b/media/libwebp/enc/predictor_enc.c
@@ -0,0 +1,772 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+// Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Used mostly to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+ double exp_val) {
+ const int significant_symbols = 256 >> 4;
+ const double exp_decay_factor = 0.6;
+ double bits = weight_0 * counts[0];
+ int i;
+ for (i = 1; i < significant_symbols; ++i) {
+ bits += exp_val * (counts[i] + counts[256 - i]);
+ exp_val *= exp_decay_factor;
+ }
+ return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+ const int tile[4][256]) {
+ int i;
+ double retval = 0;
+ for (i = 0; i < 4; ++i) {
+ const double kExpValue = 0.94;
+ retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+ retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+ }
+ return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+ ++histo_argb[0][argb >> 24];
+ ++histo_argb[1][(argb >> 16) & 0xff];
+ ++histo_argb[2][(argb >> 8) & 0xff];
+ ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+ int num_pixels, const uint32_t* current,
+ const uint32_t* upper, uint32_t* out) {
+ if (x_start == 0) {
+ if (y == 0) {
+ // ARGB_BLACK.
+ VP8LPredictorsSub[0](current, NULL, 1, out);
+ } else {
+ // Top one.
+ VP8LPredictorsSub[2](current, upper, 1, out);
+ }
+ ++x_start;
+ ++out;
+ --num_pixels;
+ }
+ if (y == 0) {
+ // Left one.
+ VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+ } else {
+ VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+ out);
+ }
+}
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+ const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+ const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+ const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+ const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+ return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+ uint32_t left, uint32_t right) {
+ const int diff_up = MaxDiffBetweenPixels(current, up);
+ const int diff_down = MaxDiffBetweenPixels(current, down);
+ const int diff_left = MaxDiffBetweenPixels(current, left);
+ const int diff_right = MaxDiffBetweenPixels(current, right);
+ return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+ const uint32_t green = (argb >> 8) & 0xff;
+ uint32_t red_blue = argb & 0x00ff00ffu;
+ red_blue += (green << 16) | green;
+ red_blue &= 0x00ff00ffu;
+ return (argb & 0xff00ff00u) | red_blue;
+}
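+// E.g. for argb == 0x00010203 (a=0x00, r=0x01, g=0x02, b=0x03) this returns
+// 0x00030205: the green value 0x02 is added, modulo 256, to red and blue.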
+
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+ uint8_t* const max_diffs, int used_subtract_green) {
+ uint32_t current, up, down, left, right;
+ int x;
+ if (width <= 2) return;
+ current = argb[0];
+ right = argb[1];
+ if (used_subtract_green) {
+ current = AddGreenToBlueAndRed(current);
+ right = AddGreenToBlueAndRed(right);
+ }
+ // max_diffs[0] and max_diffs[width - 1] are never used.
+ for (x = 1; x < width - 1; ++x) {
+ up = argb[-stride + x];
+ down = argb[stride + x];
+ left = current;
+ current = right;
+ right = argb[x + 1];
+ if (used_subtract_green) {
+ up = AddGreenToBlueAndRed(up);
+ down = AddGreenToBlueAndRed(down);
+ right = AddGreenToBlueAndRed(right);
+ }
+ max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+ }
+}
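+
+// max_diffs[x] thus holds, for each interior pixel of the row, the largest
+// per-channel deviation from its four neighbors; NearLossless() below uses
+// it to quantize more aggressively where this local activity can mask the
+// error, and not at all where max_diff <= 2.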
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+ uint8_t boundary, int quantization) {
+ const int residual = (value - predict) & 0xff;
+ const int boundary_residual = (boundary - predict) & 0xff;
+ const int lower = residual & ~(quantization - 1);
+ const int upper = lower + quantization;
+ // Resolve ties towards a value closer to the prediction (i.e. towards lower
+ // if value comes after prediction and towards upper otherwise).
+ const int bias = ((boundary - value) & 0xff) < boundary_residual;
+ if (residual - lower < upper - residual + bias) {
+ // lower is closer to residual than upper.
+ if (residual > boundary_residual && lower <= boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint >= residual
+ // (since lower is closer than upper) and residual is above the boundary.
+ return lower + (quantization >> 1);
+ }
+ return lower;
+ } else {
+ // upper is closer to residual than lower.
+ if (residual <= boundary_residual && upper > boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint <= residual
+ // (since upper is closer than lower) and residual is below the boundary.
+ return lower + (quantization >> 1);
+ }
+ return upper & 0xff;
+ }
+}
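+
+// Worked example for the rounding above: value == 0x05, predict == 0x02,
+// boundary == 0xff, quantization == 4 gives residual == 3, lower == 0,
+// upper == 4; upper is closer (distance 1 vs 3), no boundary is crossed, so
+// 4 is returned and the component decodes to predict + 4 == 0x06.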
+
+static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) {
+ return (uint8_t)((((int)(a) - (int)(b))) & 0xff);
+}
+
+// Quantize every component of the difference between the actual pixel value
+// and its prediction to a multiple of a quantization step (a power of 2, no
+// larger than max_quantization, itself a power of 2, and halved until it is
+// smaller than max_diff). Take care if value and predict have undergone
+// subtract green, which means that red and blue are represented as offsets
+// from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+ int max_quantization, int max_diff,
+ int used_subtract_green) {
+ int quantization;
+ uint8_t new_green = 0;
+ uint8_t green_diff = 0;
+ uint8_t a, r, g, b;
+ if (max_diff <= 2) {
+ return VP8LSubPixels(value, predict);
+ }
+ quantization = max_quantization;
+ while (quantization >= max_diff) {
+ quantization >>= 1;
+ }
+ if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+ // Preserve transparency of fully transparent or fully opaque pixels.
+ a = NearLosslessDiff((value >> 24) & 0xff, (predict >> 24) & 0xff);
+ } else {
+ a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+ }
+ g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+ quantization);
+ if (used_subtract_green) {
+ // The green offset will be added to red and blue components during decoding
+ // to obtain the actual red and blue values.
+ new_green = ((predict >> 8) + g) & 0xff;
+ // The amount by which green has been adjusted during quantization. It is
+ // subtracted from red and blue for compensation, to avoid accumulating two
+ // quantization errors in them.
+ green_diff = NearLosslessDiff(new_green, (value >> 8) & 0xff);
+ }
+ r = NearLosslessComponent(NearLosslessDiff((value >> 16) & 0xff, green_diff),
+ (predict >> 16) & 0xff, 0xff - new_green,
+ quantization);
+ b = NearLosslessComponent(NearLosslessDiff(value & 0xff, green_diff),
+ predict & 0xff, 0xff - new_green, quantization);
+ return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+#endif // (WEBP_NEAR_LOSSLESS == 1)
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+ int width, int height, uint32_t* const upper_row,
+ uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+ int x_start, int x_end, int y, int max_quantization, int exact,
+ int used_subtract_green, uint32_t* const out) {
+ if (exact) {
+ PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+ out);
+ } else {
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+ int x;
+ for (x = x_start; x < x_end; ++x) {
+ uint32_t predict;
+ uint32_t residual;
+ if (y == 0) {
+ predict = (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
+ } else if (x == 0) {
+ predict = upper_row[x]; // Top.
+ } else {
+ predict = pred_func(&current_row[x - 1], upper_row + x);
+ }
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+ x == 0 || x == width - 1) {
+ residual = VP8LSubPixels(current_row[x], predict);
+ } else {
+ residual = NearLossless(current_row[x], predict, max_quantization,
+ max_diffs[x], used_subtract_green);
+ // Update the source image.
+ current_row[x] = VP8LAddPixels(predict, residual);
+ // x is never 0 here so we do not need to update upper_row like below.
+ }
+#else
+ (void)max_diffs;
+ (void)height;
+ (void)max_quantization;
+ (void)used_subtract_green;
+ residual = VP8LSubPixels(current_row[x], predict);
+#endif
+ if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, clean up RGB. We can choose the RGB values of the
+        // residual for best compression. The prediction of alpha itself can
+        // be non-zero and must be kept though. We choose the RGB of the
+        // residual to be 0.
+ residual &= kMaskAlpha;
+ // Update the source image.
+ current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel in that row as its top-right context pixel. Hence if we
+        // change the leftmost pixel of current_row, the corresponding change
+        // must be applied to upper_row as well, where the top-right context
+        // is read from.
+ if (x == 0 && y != 0) upper_row[width] = current_row[0];
+ }
+ out[x - x_start] = residual;
+ }
+ }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,
+ int tile_x, int tile_y, int bits,
+ int accumulated[4][256],
+ uint32_t* const argb_scratch,
+ const uint32_t* const argb,
+ int max_quantization,
+ int exact, int used_subtract_green,
+ const uint32_t* const modes) {
+ const int kNumPredModes = 14;
+ const int start_x = tile_x << bits;
+ const int start_y = tile_y << bits;
+ const int tile_size = 1 << bits;
+ const int max_y = GetMin(tile_size, height - start_y);
+ const int max_x = GetMin(tile_size, width - start_x);
+ // Whether there exist columns just outside the tile.
+ const int have_left = (start_x > 0);
+ // Position and size of the strip covering the tile and adjacent columns if
+ // they exist.
+ const int context_start_x = start_x - have_left;
+#if (WEBP_NEAR_LOSSLESS == 1)
+ const int context_width = max_x + have_left + (max_x < width - start_x);
+#endif
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // Prediction modes of the left and above neighbor tiles.
+ const int left_mode = (tile_x > 0) ?
+ (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
+ const int above_mode = (tile_y > 0) ?
+ (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
+ float best_diff = MAX_DIFF_COST;
+ int best_mode = 0;
+ int mode;
+ int histo_stack_1[4][256];
+ int histo_stack_2[4][256];
+ // Need pointers to be able to swap arrays.
+ int (*histo_argb)[256] = histo_stack_1;
+ int (*best_histo)[256] = histo_stack_2;
+ int i, j;
+ uint32_t residuals[1 << MAX_TRANSFORM_BITS];
+ assert(bits <= MAX_TRANSFORM_BITS);
+ assert(max_x <= (1 << MAX_TRANSFORM_BITS));
+
+ for (mode = 0; mode < kNumPredModes; ++mode) {
+ float cur_diff;
+ int relative_y;
+ memset(histo_argb, 0, sizeof(histo_stack_1));
+ if (start_y > 0) {
+ // Read the row above the tile which will become the first upper_row.
+ // Include a pixel to the left if it exists; include a pixel to the right
+ // in all cases (wrapping to the leftmost pixel of the next row if it does
+ // not exist).
+ memcpy(current_row + context_start_x,
+ argb + (start_y - 1) * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + 1));
+ }
+ for (relative_y = 0; relative_y < max_y; ++relative_y) {
+ const int y = start_y + relative_y;
+ int relative_x;
+ uint32_t* tmp = upper_row;
+ upper_row = current_row;
+ current_row = tmp;
+ // Read current_row. Include a pixel to the left if it exists; include a
+ // pixel to the right in all cases except at the bottom right corner of
+ // the image (wrapping to the leftmost pixel of the next row if it does
+ // not exist in the current row).
+ memcpy(current_row + context_start_x,
+ argb + y * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization > 1 && y >= 1 && y + 1 < height) {
+ MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+ max_diffs + context_start_x, used_subtract_green);
+ }
+#endif
+
+ GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+ start_x, start_x + max_x, y, max_quantization, exact,
+ used_subtract_green, residuals);
+ for (relative_x = 0; relative_x < max_x; ++relative_x) {
+ UpdateHisto(histo_argb, residuals[relative_x]);
+ }
+ }
+ cur_diff = PredictionCostSpatialHistogram(
+ (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+ // Favor keeping the areas locally similar.
+ if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+ if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
+
+ if (cur_diff < best_diff) {
+ int (*tmp)[256] = histo_argb;
+ histo_argb = best_histo;
+ best_histo = tmp;
+ best_diff = cur_diff;
+ best_mode = mode;
+ }
+ }
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 256; j++) {
+ accumulated[i][j] += best_histo[i][j];
+ }
+ }
+
+ return best_mode;
+}
+
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+ int bits, uint32_t* const modes,
+ uint32_t* const argb_scratch,
+ uint32_t* const argb,
+ int low_effort, int max_quantization,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ uint8_t* lower_max_diffs = current_max_diffs + width;
+#endif
+ int y;
+
+ for (y = 0; y < height; ++y) {
+ int x;
+ uint32_t* const tmp32 = upper_row;
+ upper_row = current_row;
+ current_row = tmp32;
+ memcpy(current_row, argb + y * width,
+ sizeof(*argb) * (width + (y + 1 < height)));
+
+ if (low_effort) {
+ PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+ argb + y * width);
+ } else {
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization > 1) {
+ // Compute max_diffs for the lower row now, because that needs the
+ // contents of argb for the current row, which we will overwrite with
+ // residuals before proceeding with the next row.
+ uint8_t* const tmp8 = current_max_diffs;
+ current_max_diffs = lower_max_diffs;
+ lower_max_diffs = tmp8;
+ if (y + 2 < height) {
+ MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+ used_subtract_green);
+ }
+ }
+#endif
+ for (x = 0; x < width;) {
+ const int mode =
+ (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+ int x_end = x + (1 << bits);
+ if (x_end > width) x_end = width;
+ GetResidual(width, height, upper_row, current_row, current_max_diffs,
+ mode, x, x_end, y, max_quantization, exact,
+ used_subtract_green, argb + y * width + x);
+ x = x_end;
+ }
+ }
+ }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image, int near_lossless_quality,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ const int tiles_per_col = VP8LSubSampleSize(height, bits);
+ int tile_y;
+ int histo[4][256];
+ const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+ if (low_effort) {
+ int i;
+ for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+ image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+ }
+ } else {
+ memset(histo, 0, sizeof(histo));
+ for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+ int tile_x;
+ for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+ const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+ bits, histo, argb_scratch, argb, max_quantization, exact,
+ used_subtract_green, image);
+ image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+ }
+ }
+ }
+
+ CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+ low_effort, max_quantization, exact,
+ used_subtract_green);
+}
+
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+ m->green_to_red_ = 0;
+ m->green_to_blue_ = 0;
+ m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+ VP8LMultipliers* const m) {
+ m->green_to_red_ = (color_code >> 0) & 0xff;
+ m->green_to_blue_ = (color_code >> 8) & 0xff;
+ m->red_to_blue_ = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+ const VP8LMultipliers* const m) {
+ return 0xff000000u |
+ ((uint32_t)(m->red_to_blue_) << 16) |
+ ((uint32_t)(m->green_to_blue_) << 8) |
+ m->green_to_red_;
+}
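+
+// For illustration: {green_to_red_ == 0x01, green_to_blue_ == 0x02,
+// red_to_blue_ == 0x03} packs to the color code 0xff030201;
+// ColorCodeToMultipliers() above is its exact inverse.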
+
+static float PredictionCostCrossColor(const int accumulated[256],
+ const int counts[256]) {
+  // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial.
+ static const double kExpValue = 2.4;
+ return VP8LCombinedShannonEntropy(counts, accumulated) +
+ PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+ const int accumulated_red_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+ green_to_red, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+ if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_red == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenToRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+ const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
+ int green_to_red_best = 0;
+ int iter, offset;
+ float best_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_best, accumulated_red_histo);
+ for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5 fixed-point value (5 fractional bits), so
+    // 32 equals one in the color computation. An initial delta of 32 (i.e.
+    // one) is sufficient to explore the range of (-2, 2).
+ const int delta = 32 >> iter;
+ // Try a negative and a positive delta from the best known value.
+ for (offset = -delta; offset <= delta; offset += 2 * delta) {
+ const int green_to_red_cur = offset + green_to_red_best;
+ const float cur_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_cur, accumulated_red_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_red_best = green_to_red_cur;
+ }
+ }
+ }
+ best_tx->green_to_red_ = (green_to_red_best & 0xff);
+}
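+
+// A note on the search above: at the maximum kMaxIters == 6 the probed
+// deltas are 32, 16, 8, 4, 2, 1, so starting from zero every multiplier in
+// [-63, 63] is reachable in principle, i.e. roughly the (-2, 2) range in
+// 3.5 fixed point.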
+
+static float GetPredictionCostCrossColorBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+ green_to_blue, red_to_blue, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+ if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ if (red_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_blue_histo[256],
+ VP8LMultipliers* const best_tx) {
+ const int8_t offset[kGreenRedToBlueNumAxis][2] =
+ {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+ const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+ const int iters =
+ (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+ int green_to_blue_best = 0;
+ int red_to_blue_best = 0;
+ int iter;
+ // Initial value at origin:
+ float best_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+ for (iter = 0; iter < iters; ++iter) {
+ const int delta = delta_lut[iter];
+ int axis;
+ for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+ const int green_to_blue_cur =
+ offset[axis][0] * delta + green_to_blue_best;
+ const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+ const float cur_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_blue_best = green_to_blue_cur;
+ red_to_blue_best = red_to_blue_cur;
+ }
+ if (quality < 25 && iter == 4) {
+ // Only axis aligned diffs for lower quality.
+ break; // next iter.
+ }
+ }
+ if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+ // Further iterations would not help.
+ break; // out of iter-loop.
+ }
+ }
+ best_tx->green_to_blue_ = green_to_blue_best & 0xff;
+ best_tx->red_to_blue_ = red_to_blue_best & 0xff;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(
+ int tile_x, int tile_y, int bits,
+ VP8LMultipliers prev_x,
+ VP8LMultipliers prev_y,
+ int quality, int xsize, int ysize,
+ const int accumulated_red_histo[256],
+ const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ const int max_tile_size = 1 << bits;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+ const int tile_width = all_x_max - tile_x_offset;
+ const int tile_height = all_y_max - tile_y_offset;
+ const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+ + tile_x_offset;
+ VP8LMultipliers best_tx;
+ MultipliersClear(&best_tx);
+
+ GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+ GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_blue_histo,
+ &best_tx);
+ return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+ int tile_x, int tile_y,
+ int max_tile_size,
+ VP8LMultipliers color_transform,
+ uint32_t* argb) {
+ const int xscan = GetMin(max_tile_size, xsize - tile_x);
+ int yscan = GetMin(max_tile_size, ysize - tile_y);
+ argb += tile_y * xsize + tile_x;
+ while (yscan-- > 0) {
+ VP8LTransformColor(&color_transform, argb, xscan);
+ argb += xsize;
+ }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image) {
+ const int max_tile_size = 1 << bits;
+ const int tile_xsize = VP8LSubSampleSize(width, bits);
+ const int tile_ysize = VP8LSubSampleSize(height, bits);
+ int accumulated_red_histo[256] = { 0 };
+ int accumulated_blue_histo[256] = { 0 };
+ int tile_x, tile_y;
+ VP8LMultipliers prev_x, prev_y;
+ MultipliersClear(&prev_y);
+ MultipliersClear(&prev_x);
+ for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+ for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+ int y;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+ const int offset = tile_y * tile_xsize + tile_x;
+ if (tile_y != 0) {
+ ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+ }
+ prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+ prev_x, prev_y,
+ quality, width, height,
+ accumulated_red_histo,
+ accumulated_blue_histo,
+ argb);
+ image[offset] = MultipliersToColorCode(&prev_x);
+ CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+ max_tile_size, prev_x, argb);
+
+ // Gather accumulated histogram data.
+ for (y = tile_y_offset; y < all_y_max; ++y) {
+ int ix = y * width + tile_x_offset;
+ const int ix_end = ix + all_x_max - tile_x_offset;
+ for (; ix < ix_end; ++ix) {
+ const uint32_t pix = argb[ix];
+ if (ix >= 2 &&
+ pix == argb[ix - 2] &&
+ pix == argb[ix - 1]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ if (ix >= width + 2 &&
+ argb[ix - 2] == argb[ix - width - 2] &&
+ argb[ix - 1] == argb[ix - width - 1] &&
+ pix == argb[ix - width]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ ++accumulated_red_histo[(pix >> 16) & 0xff];
+ ++accumulated_blue_histo[(pix >> 0) & 0xff];
+ }
+ }
+ }
+ }
+}
diff --git a/media/libwebp/enc/quant_enc.c b/media/libwebp/enc/quant_enc.c
new file mode 100644
index 0000000000..029d62ca05
--- /dev/null
+++ b/media/libwebp/enc/quant_enc.c
@@ -0,0 +1,1388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Quantization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/quant.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+#define DO_TRELLIS_I4 1
+#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
+#define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth it.
+#define USE_TDISTO 1
+
+#define MID_ALPHA 64 // neutral value for susceptibility
+#define MIN_ALPHA 30 // lowest usable value for susceptibility
+#define MAX_ALPHA 100 // highest meaningful value for susceptibility
+
+#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
+ // power-law modulation. Must be strictly less than 1.
+
+// number of non-zero coeffs below which we consider the block very flat
+// (and apply a penalty to complex predictions)
+#define FLATNESS_LIMIT_I16 0 // I16 mode (special case)
+#define FLATNESS_LIMIT_I4 3 // I4 mode
+#define FLATNESS_LIMIT_UV 2 // UV mode
+#define FLATNESS_PENALTY 140 // roughly 1 bit per block
+
+#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
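+// (MULT_8B(a, b) computes (a * b) / 256 rounded to nearest; e.g.
+// MULT_8B(1, 200) is 1 where plain truncation would give 0.)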
+
+#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
+
+// #define DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+#if defined(DEBUG_BLOCK)
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void PrintBlockInfo(const VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int i, j;
+ const int is_i16 = (it->mb_->type_ == 1);
+ const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
+ const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
+ const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
+ printf("SOURCE / OUTPUT / ABS DELTA\n");
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 16; ++i) {
+ printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
+ }
+ printf("\n");
+ }
+ printf("\n"); // newline before the U/V block
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
+ printf(" ");
+ for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
+ printf(" ");
+ for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 8; ++i) {
+ printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+ }
+ printf(" ");
+ for (i = 8; i < 16; ++i) {
+ printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+ }
+ printf("\n");
+ }
+ printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
+ (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
+ (int)rd->score);
+ if (is_i16) {
+ printf("Mode: %d\n", rd->mode_i16);
+ printf("y_dc_levels:");
+ for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
+ printf("\n");
+ } else {
+ printf("Modes[16]: ");
+ for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
+ printf("\n");
+ }
+ printf("y_ac_levels:\n");
+ for (j = 0; j < 16; ++j) {
+ for (i = is_i16 ? 1 : 0; i < 16; ++i) {
+ printf("%4d ", rd->y_ac_levels[j][i]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ printf("uv_levels (mode=%d):\n", rd->mode_uv);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 16; ++i) {
+ printf("%4d ", rd->uv_levels[j][i]);
+ }
+ printf("\n");
+ }
+}
+
+#endif // DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+ return v < m ? m : v > M ? M : v;
+}
+
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+static const uint8_t kDcTable[128] = {
+ 4, 5, 6, 7, 8, 9, 10, 10,
+ 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22,
+ 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36,
+ 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136,
+ 138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92,
+ 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128,
+ 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177,
+ 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245,
+ 249, 254, 259, 264, 269, 274, 279, 284
+};
+
+static const uint16_t kAcTable2[128] = {
+ 8, 8, 9, 10, 12, 13, 15, 17,
+ 18, 20, 21, 23, 24, 26, 27, 29,
+ 31, 32, 34, 35, 37, 38, 40, 41,
+ 43, 44, 46, 48, 49, 51, 52, 54,
+ 55, 57, 58, 60, 62, 63, 65, 66,
+ 68, 69, 71, 72, 74, 75, 77, 79,
+ 80, 82, 83, 85, 86, 88, 89, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 124, 127, 130, 133, 136, 139, 142,
+ 145, 148, 151, 155, 158, 161, 164, 167,
+ 170, 173, 176, 179, 184, 189, 193, 198,
+ 203, 207, 212, 217, 221, 226, 230, 235,
+ 240, 244, 249, 254, 258, 263, 268, 274,
+ 280, 286, 292, 299, 305, 311, 317, 323,
+ 330, 336, 342, 348, 354, 362, 370, 379,
+ 385, 393, 401, 409, 416, 424, 432, 440
+};
+
+static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac]
+ { 96, 110 }, { 96, 108 }, { 110, 115 }
+};
+
+// Sharpening by (slightly) raising the hi-frequency coeffs.
+// Hack-ish but helpful for mid-bitrate range. Use with care.
+#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
+static const uint8_t kFreqSharpening[16] = {
+ 0, 30, 60, 90,
+ 30, 60, 90, 90,
+ 60, 90, 90, 90,
+ 90, 90, 90, 90
+};
+
+//------------------------------------------------------------------------------
+// Initialize quantization parameters in VP8Matrix
+
+// Returns the average quantizer
+static int ExpandMatrix(VP8Matrix* const m, int type) {
+ int i, sum;
+ for (i = 0; i < 2; ++i) {
+ const int is_ac_coeff = (i > 0);
+ const int bias = kBiasMatrices[type][is_ac_coeff];
+ m->iq_[i] = (1 << QFIX) / m->q_[i];
+ m->bias_[i] = BIAS(bias);
+ // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
+ // * zero if coeff <= zthresh
+ // * non-zero if coeff > zthresh
+ m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
+ }
+ for (i = 2; i < 16; ++i) {
+ m->q_[i] = m->q_[1];
+ m->iq_[i] = m->iq_[1];
+ m->bias_[i] = m->bias_[1];
+ m->zthresh_[i] = m->zthresh_[1];
+ }
+ for (sum = 0, i = 0; i < 16; ++i) {
+ if (type == 0) { // we only use sharpening for AC luma coeffs
+ m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
+ } else {
+ m->sharpen_[i] = 0;
+ }
+ sum += m->q_[i];
+ }
+ return (sum + 8) >> 4;
+}
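+
+// A sketch of the zthresh_ relation above, assuming the usual definitions
+// from vp8i_enc.h (QFIX == 17, BIAS(b) == (b) << (QFIX - 8) and
+// QUANTDIV(n, iQ, B) == ((n) * (iQ) + (B)) >> QFIX): the quotient is zero
+// exactly while n * iQ + B < (1 << QFIX), i.e. while n <= zthresh_.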
+
+static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
+
+static void SetupMatrices(VP8Encoder* enc) {
+ int i;
+ const int tlambda_scale =
+ (enc->method_ >= 4) ? enc->config_->sns_strength
+ : 0;
+ const int num_segments = enc->segment_hdr_.num_segments_;
+ for (i = 0; i < num_segments; ++i) {
+ VP8SegmentInfo* const m = &enc->dqm_[i];
+ const int q = m->quant_;
+ int q_i4, q_i16, q_uv;
+ m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
+ m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
+
+ m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
+ m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
+
+ m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
+ m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
+
+ q_i4 = ExpandMatrix(&m->y1_, 0);
+ q_i16 = ExpandMatrix(&m->y2_, 1);
+ q_uv = ExpandMatrix(&m->uv_, 2);
+
+ m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
+ m->lambda_i16_ = (3 * q_i16 * q_i16);
+ m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
+ m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
+ m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
+ m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
+ m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
+ m->tlambda_ = (tlambda_scale * q_i4) >> 5;
+
+ // none of these constants should be < 1
+ CheckLambdaValue(&m->lambda_i4_);
+ CheckLambdaValue(&m->lambda_i16_);
+ CheckLambdaValue(&m->lambda_uv_);
+ CheckLambdaValue(&m->lambda_mode_);
+ CheckLambdaValue(&m->lambda_trellis_i4_);
+ CheckLambdaValue(&m->lambda_trellis_i16_);
+ CheckLambdaValue(&m->lambda_trellis_uv_);
+ CheckLambdaValue(&m->tlambda_);
+
+ m->min_disto_ = 20 * m->y1_.q_[0]; // quantization-aware min disto
+ m->max_edge_ = 0;
+
+ m->i4_penalty_ = 1000 * q_i4 * q_i4;
+ }
+}
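+
+// Note that the lambda_* values above grow roughly as q^2 (tlambda_ only
+// linearly), tracking the squared-error distortion terms in the RD scores
+// so that the rate/distortion balance stays comparable across quantizers.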
+
+//------------------------------------------------------------------------------
+// Initialize filtering parameters
+
+// Very small filter-strength values have close to no visual effect. So we can
+// save a little decoding CPU by turning filtering off for these.
+#define FSTRENGTH_CUTOFF 2
+
+static void SetupFilterStrength(VP8Encoder* const enc) {
+ int i;
+ // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
+ const int level0 = 5 * enc->config_->filter_strength;
+ for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+ VP8SegmentInfo* const m = &enc->dqm_[i];
+ // We focus on the quantization of AC coeffs.
+ const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
+ const int base_strength =
+ VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
+ // Segments with lower complexity ('beta') will be less filtered.
+ const int f = base_strength * level0 / (256 + m->beta_);
+ m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+ }
+ // We record the initial strength (mainly for the case of 1-segment only).
+ enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
+ enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
+ enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
+}
+
+//------------------------------------------------------------------------------
+
+// Note: if you change the values below, remember that the max range
+// allowed by the syntax for DQ_UV is [-16,16].
+#define MAX_DQ_UV (6)
+#define MIN_DQ_UV (-4)
+
+// We want to emulate jpeg-like behaviour where the expected "good" quality
+// is around q=75. Internally, our "good" middle is around c=50. So we
+// map accordingly using a piece-wise linear function.
+static double QualityToCompression(double c) {
+ const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+ // The file size roughly scales as pow(quantizer, 3.). Actually, the
+ // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+ // in the mid-quant range. So we scale the compressibility inversely to
+ // this power-law: quant ~= compression ^ 1/3. This law holds well for
+ // low quant. Finer modeling for high-quant would make use of kAcTable[]
+ // more explicitly.
+ const double v = pow(linear_c, 1 / 3.);
+ return v;
+}
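+
+// For illustration: c == 0.75 maps to linear_c == 0.5 and a compression
+// factor of 0.5^(1/3) ~= 0.794, i.e. q ~= 127 * (1 - 0.794) ~= 26 before the
+// per-segment modulation applied in VP8SetSegmentParams() below.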
+
+static double QualityToJPEGCompression(double c, double alpha) {
+ // We map the complexity 'alpha' and quality setting 'c' to a compression
+ // exponent empirically matched to the compression curve of libjpeg6b.
+ // On average, the WebP output size will be roughly similar to that of a
+ // JPEG file compressed with same quality factor.
+ const double amin = 0.30;
+ const double amax = 0.85;
+ const double exp_min = 0.4;
+ const double exp_max = 0.9;
+ const double slope = (exp_min - exp_max) / (amax - amin);
+ // Linearly interpolate 'expn' from exp_min to exp_max
+ // in the [amin, amax] range.
+ const double expn = (alpha > amax) ? exp_min
+ : (alpha < amin) ? exp_max
+ : exp_max + slope * (alpha - amin);
+ const double v = pow(c, expn);
+ return v;
+}
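+
+// For illustration: a mid-complexity source with alpha == 0.5 yields
+// expn ~= 0.9 - 0.909 * 0.2 ~= 0.72, so c == 0.5 maps to 0.5^0.72 ~= 0.61.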
+
+static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
+ const VP8SegmentInfo* const S2) {
+ return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
+}
+
+static void SimplifySegments(VP8Encoder* const enc) {
+ int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
+ // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+ // explicit check is needed to avoid a spurious warning about 'i' exceeding
+ // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
+ const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
+ ? enc->segment_hdr_.num_segments_
+ : NUM_MB_SEGMENTS;
+ int num_final_segments = 1;
+ int s1, s2;
+ for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
+ const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
+ int found = 0;
+ // check if we already have similar segment
+ for (s2 = 0; s2 < num_final_segments; ++s2) {
+ const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
+ if (SegmentsAreEquivalent(S1, S2)) {
+ found = 1;
+ break;
+ }
+ }
+ map[s1] = s2;
+ if (!found) {
+ if (num_final_segments != s1) {
+ enc->dqm_[num_final_segments] = enc->dqm_[s1];
+ }
+ ++num_final_segments;
+ }
+ }
+ if (num_final_segments < num_segments) { // Remap
+ int i = enc->mb_w_ * enc->mb_h_;
+ while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
+ enc->segment_hdr_.num_segments_ = num_final_segments;
+ // Replicate the trailing segment infos (it's mostly cosmetics)
+ for (i = num_final_segments; i < num_segments; ++i) {
+ enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
+ }
+ }
+}
+
+void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
+ int i;
+ int dq_uv_ac, dq_uv_dc;
+ const int num_segments = enc->segment_hdr_.num_segments_;
+ const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
+ const double Q = quality / 100.;
+ const double c_base = enc->config_->emulate_jpeg_size ?
+ QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
+ QualityToCompression(Q);
+ for (i = 0; i < num_segments; ++i) {
+    // We modulate the base coefficient to account for the quantization
+    // susceptibility and allow denser segments to be quantized more.
+ const double expn = 1. - amp * enc->dqm_[i].alpha_;
+ const double c = pow(c_base, expn);
+ const int q = (int)(127. * (1. - c));
+ assert(expn > 0.);
+ enc->dqm_[i].quant_ = clip(q, 0, 127);
+ }
+
+ // purely indicative in the bitstream (except for the 1-segment case)
+ enc->base_quant_ = enc->dqm_[0].quant_;
+
+ // fill-in values for the unused segments (required by the syntax)
+ for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
+ enc->dqm_[i].quant_ = enc->base_quant_;
+ }
+
+ // uv_alpha_ is normally spread around ~60. The useful range is
+ // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
+ // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
+ dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
+ / (MAX_ALPHA - MIN_ALPHA);
+ // we rescale by the user-defined strength of adaptation
+ dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
+ // and make it safe.
+ dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
+  // We also boost the dc-uv-quant a little, based on sns-strength, since
+  // U/V channels are much more reactive to high quants (flat DC-blocks
+  // tend to appear, and are unpleasant).
+ dq_uv_dc = -4 * enc->config_->sns_strength / 100;
+ dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
+
+ enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
+ enc->dq_y2_dc_ = 0;
+ enc->dq_y2_ac_ = 0;
+ enc->dq_uv_dc_ = dq_uv_dc;
+ enc->dq_uv_ac_ = dq_uv_ac;
+
+  SetupFilterStrength(enc); // initialize the segments' filtering, if needed
+
+ if (num_segments > 1) SimplifySegments(enc);
+
+ SetupMatrices(enc); // finalize quantization matrices
+}
+
+//------------------------------------------------------------------------------
+// Form the predictions in cache
+
+// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+
+// Must be indexed using {B_DC_PRED .. B_HU_PRED} as index
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
+ I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
+};
+
+void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
+ const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
+ const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+ VP8EncPredLuma16(it->yuv_p_, left, top);
+}
+
+void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
+ const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
+ const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+ VP8EncPredChroma8(it->yuv_p_, left, top);
+}
+
+void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
+ VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
+}
+
+//------------------------------------------------------------------------------
+// Quantize
+
+// Layout:
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
+
+const uint16_t VP8Scan[16] = { // Luma
+ 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
+ 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
+ 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
+ 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+};
+
+static const uint16_t VP8ScanUV[4 + 4] = {
+ 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
+ 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
+};
+
+//------------------------------------------------------------------------------
+// Distortion measurement
+
+static const uint16_t kWeightY[16] = {
+ 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
+};
+
+static const uint16_t kWeightTrellis[16] = {
+#if USE_TDISTO == 0
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+#else
+ 30, 27, 19, 11,
+ 27, 24, 17, 10,
+ 19, 17, 12, 8,
+ 11, 10, 8, 6
+#endif
+};
+
+// Init/Copy the common fields in score.
+static void InitScore(VP8ModeScore* const rd) {
+ rd->D = 0;
+ rd->SD = 0;
+ rd->R = 0;
+ rd->H = 0;
+ rd->nz = 0;
+ rd->score = MAX_COST;
+}
+
+static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+ dst->D = src->D;
+ dst->SD = src->SD;
+ dst->R = src->R;
+ dst->H = src->H;
+ dst->nz = src->nz; // note that nz is not accumulated, but just copied.
+ dst->score = src->score;
+}
+
+static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+ dst->D += src->D;
+ dst->SD += src->SD;
+ dst->R += src->R;
+ dst->H += src->H;
+ dst->nz |= src->nz; // here, new nz bits are accumulated.
+ dst->score += src->score;
+}
+
+//------------------------------------------------------------------------------
+// Performs trellis-optimized quantization.
+
+// Trellis node
+typedef struct {
+ int8_t prev; // best previous node
+ int8_t sign; // sign of coeff_i
+ int16_t level; // level
+} Node;
+
+// Score state
+typedef struct {
+ score_t score; // partial RD score
+ const uint16_t* costs; // shortcut to cost tables
+} ScoreState;
+
+// If a coefficient was quantized to a value Q (using a neutral bias),
+// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
+// We don't test negative values though.
+#define MIN_DELTA 0 // how much lower level to try
+#define MAX_DELTA 1 // how much higher
+#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
+#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
+#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
+
+static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
+ rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
+}
+
+static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
+ score_t distortion) {
+ return rate * lambda + RD_DISTO_MULT * distortion;
+}
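+
+// In both scores above, lambda sets the rate/distortion trade-off: with
+// RD_DISTO_MULT == 256, one unit of rate is worth lambda / 256 units of
+// (weighted) squared-error distortion.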
+
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
+static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+ int16_t in[16], int16_t out[16],
+ int ctx0, int coeff_type,
+ const VP8Matrix* const mtx,
+ int lambda) {
+ const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
+ CostArrayPtr const costs =
+ (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
+ const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
+ Node nodes[16][NUM_NODES];
+ ScoreState score_states[2][NUM_NODES];
+ ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
+ ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
+ int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
+ score_t best_score;
+ int n, m, p, last;
+
+ {
+ score_t cost;
+ const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
+ const int last_proba = probas[VP8EncBands[first]][ctx0][0];
+
+ // compute the position of the last interesting coefficient
+ last = first - 1;
+ for (n = 15; n >= first; --n) {
+ const int j = kZigzag[n];
+ const int err = in[j] * in[j];
+ if (err > thresh) {
+ last = n;
+ break;
+ }
+ }
+    // we don't need to inspect coefficients all the way up to n = 16. We can
+    // just go up to last + 1 (inclusive) without losing much.
+ if (last < 15) ++last;
+
+ // compute 'skip' score. This is the max score one can do.
+ cost = VP8BitCost(0, last_proba);
+ best_score = RDScoreTrellis(lambda, cost, 0);
+
+ // initialize source node.
+ for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+ const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+ ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
+ ss_cur[m].costs = costs[first][ctx0];
+ }
+ }
+
+ // traverse trellis.
+ for (n = first; n <= last; ++n) {
+ const int j = kZigzag[n];
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = BIAS(0x00); // neutral bias
+ // note: it's important to take sign of the _original_ coeff,
+ // so we don't have to consider level < 0 afterward.
+ const int sign = (in[j] < 0);
+ const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ int level0 = QUANTDIV(coeff0, iQ, B);
+ int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+ if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
+ if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+
+ { // Swap current and previous score states
+ ScoreState* const tmp = ss_cur;
+ ss_cur = ss_prev;
+ ss_prev = tmp;
+ }
+
+ // test all alternate level values around level0.
+ for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+ Node* const cur = &NODE(n, m);
+ const int level = level0 + m;
+ const int ctx = (level > 2) ? 2 : level;
+ const int band = VP8EncBands[n + 1];
+ score_t base_score;
+ score_t best_cur_score;
+ int best_prev;
+ score_t cost, score;
+
+ ss_cur[m].costs = costs[n + 1][ctx];
+ if (level < 0 || level > thresh_level) {
+ ss_cur[m].score = MAX_COST;
+ // Node is dead.
+ continue;
+ }
+
+ {
+        // Compute delta_error = how much coding this level will
+        // subtract from max_error as distortion.
+        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
+ const int new_error = coeff0 - level * Q;
+ const int delta_error =
+ kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
+ base_score = RDScoreTrellis(lambda, 0, delta_error);
+ }
+
+      // Inspect all possible non-dead predecessors. Retain only the best one.
+      // Since base_score is common to all candidates, it is only added to the
+      // final value after the loop.
+ cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+ best_cur_score =
+ ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+ best_prev = -MIN_DELTA;
+ for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
+ // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
+ // eliminated since their score can't be better than the current best.
+ cost = VP8LevelCost(ss_prev[p].costs, level);
+ // Examine node assuming it's a non-terminal one.
+ score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+ if (score < best_cur_score) {
+ best_cur_score = score;
+ best_prev = p;
+ }
+ }
+ best_cur_score += base_score;
+ // Store best finding in current node.
+ cur->sign = sign;
+ cur->level = level;
+ cur->prev = best_prev;
+ ss_cur[m].score = best_cur_score;
+
+ // Now, record best terminal node (and thus best entry in the graph).
+ if (level != 0 && best_cur_score < best_score) {
+ const score_t last_pos_cost =
+ (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+ const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
+ score = best_cur_score + last_pos_score;
+ if (score < best_score) {
+ best_score = score;
+ best_path[0] = n; // best eob position
+ best_path[1] = m; // best node index
+ best_path[2] = best_prev; // best predecessor
+ }
+ }
+ }
+ }
+
+ // Fresh start
+  // Beware! We must preserve the in[0]/out[0] values for the TYPE_I16_AC case.
+ if (coeff_type == TYPE_I16_AC) {
+ memset(in + 1, 0, 15 * sizeof(*in));
+ memset(out + 1, 0, 15 * sizeof(*out));
+ } else {
+ memset(in, 0, 16 * sizeof(*in));
+ memset(out, 0, 16 * sizeof(*out));
+ }
+ if (best_path[0] == -1) {
+ return 0; // skip!
+ }
+
+ {
+ // Unwind the best path.
+ // Note: best-prev on terminal node is not necessarily equal to the
+ // best_prev for non-terminal. So we patch best_path[2] in.
+ int nz = 0;
+ int best_node = best_path[1];
+ n = best_path[0];
+ NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
+
+ for (; n >= first; --n) {
+ const Node* const node = &NODE(n, best_node);
+ const int j = kZigzag[n];
+ out[n] = node->sign ? -node->level : node->level;
+ nz |= node->level;
+ in[j] = out[n] * mtx->q_[j];
+ best_node = node->prev;
+ }
+ return (nz != 0);
+ }
+}
+
+#undef NODE
+
+//------------------------------------------------------------------------------
+// Performs: difference, transform, quantize, back-transform, add
+// all at once. Output is the reconstructed block in *yuv_out, and the
+// quantized levels in *levels.
+
+static int ReconstructIntra16(VP8EncIterator* const it,
+ VP8ModeScore* const rd,
+ uint8_t* const yuv_out,
+ int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int n;
+ int16_t tmp[16][16], dc_tmp[16];
+
+ for (n = 0; n < 16; n += 2) {
+ VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+ }
+ VP8FTransformWHT(tmp[0], dc_tmp);
+ nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+
+ if (DO_TRELLIS_I16 && it->do_trellis_) {
+ int x, y;
+ VP8IteratorNzToBytes(it);
+ for (y = 0, n = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x, ++n) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ const int non_zero = TrellisQuantizeBlock(
+ enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+ dqm->lambda_trellis_i16_);
+ it->top_nz_[x] = it->left_nz_[y] = non_zero;
+ rd->y_ac_levels[n][0] = 0;
+ nz |= non_zero << n;
+ }
+ }
+ } else {
+ for (n = 0; n < 16; n += 2) {
+ // Zero-out the first coeff, so that: a) nz is correct below, and
+ // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+ tmp[n][0] = tmp[n + 1][0] = 0;
+ nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+ assert(rd->y_ac_levels[n + 0][0] == 0);
+ assert(rd->y_ac_levels[n + 1][0] == 0);
+ }
+ }
+
+ // Transform back
+ VP8TransformWHT(dc_tmp, tmp[0]);
+ for (n = 0; n < 16; n += 2) {
+ VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
+ }
+
+ return nz;
+}
+
+static int ReconstructIntra4(VP8EncIterator* const it,
+ int16_t levels[16],
+ const uint8_t* const src,
+ uint8_t* const yuv_out,
+ int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int16_t tmp[16];
+
+ VP8FTransform(src, ref, tmp);
+ if (DO_TRELLIS_I4 && it->do_trellis_) {
+ const int x = it->i4_ & 3, y = it->i4_ >> 2;
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
+ dqm->lambda_trellis_i4_);
+ } else {
+ nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
+ }
+ VP8ITransform(ref, tmp, yuv_out, 0);
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7 // fraction of error sent to the 4x4 block below
+#define C2 8 // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1 // storage descaling, needed to make the error fit an int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// The returned error is pre-scaled by DSCALE bits; the remaining descaling
+// by DSHIFT happens when the error is diffused in CorrectDCValues() below.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+ int V = *v;
+ const int sign = (V < 0);
+ if (sign) V = -V;
+ if (V > (int)mtx->zthresh_[0]) {
+ const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+ const int err = (V - qV);
+ *v = sign ? -qV : qV;
+ return (sign ? -err : err) >> DSCALE;
+ }
+ *v = 0;
+ return (sign ? -V : V) >> DSCALE;
+}
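+
+// The returned residual is bounded by mtx->q_[0] before scaling, carries the
+// sign of the original coefficient, and is what CorrectDCValues() below
+// diffuses into the DC coefficients of the neighboring 4x4 blocks.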
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+ const VP8Matrix* const mtx,
+ int16_t tmp[][16], VP8ModeScore* const rd) {
+ // | top[0] | top[1]
+ // --------+--------+---------
+ // left[0] | tmp[0] tmp[1] <-> err0 err1
+ // left[1] | tmp[2] tmp[3] err2 err3
+ //
+ // Final errors {err1,err2,err3} are preserved and later restored
+ // as top[]/left[] on the next block.
+ int ch;
+ for (ch = 0; ch <= 1; ++ch) {
+ const int8_t* const top = it->top_derr_[it->x_][ch];
+ const int8_t* const left = it->left_derr_[ch];
+ int16_t (* const c)[16] = &tmp[ch * 4];
+ int err0, err1, err2, err3;
+ c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+ err0 = QuantizeSingle(&c[0][0], mtx);
+ c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+ err1 = QuantizeSingle(&c[1][0], mtx);
+ c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+ err2 = QuantizeSingle(&c[2][0], mtx);
+ c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+ err3 = QuantizeSingle(&c[3][0], mtx);
+ // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+ // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+ assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+ rd->derr[ch][0] = (int8_t)err1;
+ rd->derr[ch][1] = (int8_t)err2;
+ rd->derr[ch][2] = (int8_t)err3;
+ }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int ch;
+ for (ch = 0; ch <= 1; ++ch) {
+ int8_t* const top = it->top_derr_[it->x_][ch];
+ int8_t* const left = it->left_derr_[ch];
+ left[0] = rd->derr[ch][0]; // restore err1
+ left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
+ top[0] = rd->derr[ch][1]; // ... err2
+ top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
+ }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
+static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
+ uint8_t* const yuv_out, int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int n;
+ int16_t tmp[8][16];
+
+ for (n = 0; n < 8; n += 2) {
+ VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+ }
+ if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
+ if (DO_TRELLIS_UV && it->do_trellis_) {
+ int ch, x, y;
+ for (ch = 0, n = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x, ++n) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ const int non_zero = TrellisQuantizeBlock(
+ enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+ dqm->lambda_trellis_uv_);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
+ nz |= non_zero << n;
+ }
+ }
+ }
+ } else {
+ for (n = 0; n < 8; n += 2) {
+ nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+ }
+ }
+
+ for (n = 0; n < 8; n += 2) {
+ VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
+ }
+ return (nz << 16);
+}
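+
+// The chroma nz flags returned above occupy bits 16..23 of the macroblock's
+// nz bitfield, above the sixteen per-block luma bits (bit 24 is used for
+// the i16 luma DC block).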
+
+//------------------------------------------------------------------------------
+// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
+// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
+
+static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
+ // We look at the first three AC coefficients to determine the average
+ // delta between each sub-4x4 block.
+ const int v0 = abs(DCs[1]);
+ const int v1 = abs(DCs[2]);
+ const int v2 = abs(DCs[4]);
+ int max_v = (v1 > v0) ? v1 : v0;
+ max_v = (v2 > max_v) ? v2 : max_v;
+ if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
+}
+
+static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
+ VP8ModeScore* const tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void SwapPtr(uint8_t** a, uint8_t** b) {
+ uint8_t* const tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void SwapOut(VP8EncIterator* const it) {
+ SwapPtr(&it->yuv_out_, &it->yuv_out2_);
+}
+
+static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+ const int kNumBlocks = 16;
+ VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_i16_;
+ const int tlambda = dqm->tlambda_;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ VP8ModeScore rd_tmp;
+ VP8ModeScore* rd_cur = &rd_tmp;
+ VP8ModeScore* rd_best = rd;
+ int mode;
+ int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
+
+ rd->mode_i16 = -1;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC; // scratch buffer
+ rd_cur->mode_i16 = mode;
+
+ // Reconstruct
+ rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
+
+ // Measure RD-score
+ rd_cur->D = VP8SSE16x16(src, tmp_dst);
+ rd_cur->SD =
+ tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
+ rd_cur->H = VP8FixedCostsI16[mode];
+ rd_cur->R = VP8GetCostLuma16(it, rd_cur);
+ if (is_flat) {
+ // refine the first impression (which was in pixel space)
+ is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
+ if (is_flat) {
+ // Block is very flat. We put emphasis on the distortion being very low!
+ rd_cur->D *= 2;
+ rd_cur->SD *= 2;
+ }
+ }
+
+ // Since we always examine Intra16 first, we can overwrite *rd directly.
+ SetRDScore(lambda, rd_cur);
+ if (mode == 0 || rd_cur->score < rd_best->score) {
+ SwapModeScore(&rd_cur, &rd_best);
+ SwapOut(it);
+ }
+ }
+ if (rd_best != rd) {
+ memcpy(rd, rd_best, sizeof(*rd));
+ }
+ SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
+ VP8SetIntra16Mode(it, rd->mode_i16);
+
+ // If we have a blocky macroblock (only DCs are non-zero) with fairly high
+ // distortion, record the max delta so we can later adjust the minimal
+ // filtering strength needed to smooth these blocks out.
+ if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
+ StoreMaxDelta(dqm, rd->y_dc_levels);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+// return the cost array corresponding to the surrounding prediction modes.
+static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+ const uint8_t modes[16]) {
+ const int preds_w = it->enc_->preds_w_;
+ const int x = (it->i4_ & 3), y = it->i4_ >> 2;
+ const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
+ const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
+ return VP8FixedCostsI4[top][left];
+}
+
+static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const VP8Encoder* const enc = it->enc_;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_i4_;
+ const int tlambda = dqm->tlambda_;
+ const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
+ uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
+ int total_header_bits = 0;
+ VP8ModeScore rd_best;
+
+ if (enc->max_i4_header_bits_ == 0) {
+ return 0;
+ }
+
+ InitScore(&rd_best);
+ rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145)
+ SetRDScore(dqm->lambda_mode_, &rd_best);
+ VP8IteratorStartI4(it);
+ do {
+ const int kNumBlocks = 1;
+ VP8ModeScore rd_i4;
+ int mode;
+ int best_mode = -1;
+ const uint8_t* const src = src0 + VP8Scan[it->i4_];
+ const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+ uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
+ uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
+
+ InitScore(&rd_i4);
+ VP8MakeIntra4Preds(it);
+ for (mode = 0; mode < NUM_BMODES; ++mode) {
+ VP8ModeScore rd_tmp;
+ int16_t tmp_levels[16];
+
+ // Reconstruct
+ rd_tmp.nz =
+ ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
+
+ // Compute RD-score
+ rd_tmp.D = VP8SSE4x4(src, tmp_dst);
+ rd_tmp.SD =
+ tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
+ : 0;
+ rd_tmp.H = mode_costs[mode];
+
+ // Add a flatness penalty, to avoid flat areas being mispredicted
+ // by a complex mode.
+ if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
+ rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
+ } else {
+ rd_tmp.R = 0;
+ }
+
+ // early-out check
+ SetRDScore(lambda, &rd_tmp);
+ if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
+
+ // finish computing score
+ rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
+ SetRDScore(lambda, &rd_tmp);
+
+ if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
+ CopyScore(&rd_i4, &rd_tmp);
+ best_mode = mode;
+ SwapPtr(&tmp_dst, &best_block);
+ memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
+ sizeof(rd_best.y_ac_levels[it->i4_]));
+ }
+ }
+ SetRDScore(dqm->lambda_mode_, &rd_i4);
+ AddScore(&rd_best, &rd_i4);
+ if (rd_best.score >= rd->score) {
+ return 0;
+ }
+ total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
+ if (total_header_bits > enc->max_i4_header_bits_) {
+ return 0;
+ }
+ // Copy selected samples if not in the right place already.
+ if (best_block != best_blocks + VP8Scan[it->i4_]) {
+ VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
+ }
+ rd->modes_i4[it->i4_] = best_mode;
+ it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
+ } while (VP8IteratorRotateI4(it, best_blocks));
+
+ // finalize state
+ CopyScore(rd, &rd_best);
+ VP8SetIntra4Mode(it, rd->modes_i4);
+ SwapOut(it);
+ memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
+ return 1; // select intra4x4 over intra16x16
+}
+
+//------------------------------------------------------------------------------
+
+static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const int kNumBlocks = 8;
+ const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_uv_;
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC; // scratch buffer
+ uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
+ uint8_t* dst = dst0;
+ VP8ModeScore rd_best;
+ int mode;
+
+ rd->mode_uv = -1;
+ InitScore(&rd_best);
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ VP8ModeScore rd_uv;
+
+ // Reconstruct
+ rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
+
+ // Compute RD-score
+ rd_uv.D = VP8SSE16x8(src, tmp_dst);
+ rd_uv.SD = 0; // not calling TDisto here: it tends to flatten areas.
+ rd_uv.H = VP8FixedCostsUV[mode];
+ rd_uv.R = VP8GetCostUV(it, &rd_uv);
+ if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
+ rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
+ }
+
+ SetRDScore(lambda, &rd_uv);
+ if (mode == 0 || rd_uv.score < rd_best.score) {
+ CopyScore(&rd_best, &rd_uv);
+ rd->mode_uv = mode;
+ memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+ if (it->top_derr_ != NULL) {
+ memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+ }
+ SwapPtr(&dst, &tmp_dst);
+ }
+ }
+ VP8SetIntraUVMode(it, rd->mode_uv);
+ AddScore(rd, &rd_best);
+ if (dst != dst0) { // copy 16x8 block if needed
+ VP8Copy16x8(dst, dst0);
+ }
+ if (it->top_derr_ != NULL) { // store diffusion errors for next block
+ StoreDiffusionErrors(it, rd);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Final reconstruction and quantization.
+
+static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const VP8Encoder* const enc = it->enc_;
+ const int is_i16 = (it->mb_->type_ == 1);
+ int nz = 0;
+
+ if (is_i16) {
+ nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+ } else {
+ VP8IteratorStartI4(it);
+ do {
+ const int mode =
+ it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ VP8MakeIntra4Preds(it);
+ nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+ src, dst, mode) << it->i4_;
+ } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
+ }
+
+ nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+ rd->nz = nz;
+}
+
+// Refine intra16/intra4 sub-modes based on distortion only (not rate).
+static void RefineUsingDistortion(VP8EncIterator* const it,
+ int try_both_modes, int refine_uv_mode,
+ VP8ModeScore* const rd) {
+ score_t best_score = MAX_COST;
+ int nz = 0;
+ int mode;
+ int is_i16 = try_both_modes || (it->mb_->type_ == 1);
+
+ const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ // Some empirical constants of roughly the right order of magnitude.
+ const int lambda_d_i16 = 106;
+ const int lambda_d_i4 = 11;
+ const int lambda_d_uv = 120;
+ score_t score_i4 = dqm->i4_penalty_;
+ score_t i4_bit_sum = 0;
+ const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+ : MAX_COST; // no early-out allowed
+
+ if (is_i16) { // First, evaluate Intra16 distortion
+ int best_mode = -1;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+ const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
+ + VP8FixedCostsI16[mode] * lambda_d_i16;
+ if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
+ continue;
+ }
+
+ if (score < best_score) {
+ best_mode = mode;
+ best_score = score;
+ }
+ }
+ if (it->x_ == 0 || it->y_ == 0) {
+ // avoid starting a checkerboard resonance from the border. See bug #432.
+ if (IsFlatSource16(src)) {
+ best_mode = (it->x_ == 0) ? 0 : 2;
+ try_both_modes = 0; // stick to i16
+ }
+ }
+ VP8SetIntra16Mode(it, best_mode);
+ // we'll reconstruct later, if i16 mode actually gets selected
+ }
+
+ // Next, evaluate Intra4
+ if (try_both_modes || !is_i16) {
+ // We don't evaluate the rate here, but just account for it through a
+ // constant penalty (i4 mode usually needs more bits compared to i16).
+ is_i16 = 0;
+ VP8IteratorStartI4(it);
+ do {
+ int best_i4_mode = -1;
+ score_t best_i4_score = MAX_COST;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+
+ VP8MakeIntra4Preds(it);
+ for (mode = 0; mode < NUM_BMODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+ const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
+ + mode_costs[mode] * lambda_d_i4;
+ if (score < best_i4_score) {
+ best_i4_mode = mode;
+ best_i4_score = score;
+ }
+ }
+ i4_bit_sum += mode_costs[best_i4_mode];
+ rd->modes_i4[it->i4_] = best_i4_mode;
+ score_i4 += best_i4_score;
+ if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
+ // Intra4 won't be better than Intra16. Bail out and pick Intra16.
+ is_i16 = 1;
+ break;
+ } else { // reconstruct partial block inside yuv_out2_ buffer
+ uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+ src, tmp_dst, best_i4_mode) << it->i4_;
+ }
+ } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
+ }
+
+ // Final reconstruction, depending on which mode is selected.
+ if (!is_i16) {
+ VP8SetIntra4Mode(it, rd->modes_i4);
+ SwapOut(it);
+ best_score = score_i4;
+ } else {
+ nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+ }
+
+ // ... and UV!
+ if (refine_uv_mode) {
+ int best_mode = -1;
+ score_t best_uv_score = MAX_COST;
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+ const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
+ + VP8FixedCostsUV[mode] * lambda_d_uv;
+ if (score < best_uv_score) {
+ best_mode = mode;
+ best_uv_score = score;
+ }
+ }
+ VP8SetIntraUVMode(it, best_mode);
+ }
+ nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+
+ rd->nz = nz;
+ rd->score = best_score;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+ VP8RDLevel rd_opt) {
+ int is_skipped;
+ const int method = it->enc_->method_;
+
+ InitScore(rd);
+
+ // We can perform predictions for Luma16x16 and Chroma8x8 already.
+ // Luma4x4 predictions need to be done as we go.
+ VP8MakeLuma16Preds(it);
+ VP8MakeChroma8Preds(it);
+
+ if (rd_opt > RD_OPT_NONE) {
+ it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
+ PickBestIntra16(it, rd);
+ if (method >= 2) {
+ PickBestIntra4(it, rd);
+ }
+ PickBestUV(it, rd);
+ if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
+ it->do_trellis_ = 1;
+ SimpleQuantize(it, rd);
+ }
+ } else {
+ // At this point we have heuristically decided intra16 / intra4.
+ // For method >= 2, pick the best intra4/intra16 based on SSE (a tad slower).
+ // For method <= 1, we don't re-examine the decision but just go ahead with
+ // quantization/reconstruction.
+ RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
+ }
+ is_skipped = (rd->nz == 0);
+ VP8SetSkip(it, is_skipped);
+ return is_skipped;
+}
diff --git a/media/libwebp/enc/syntax_enc.c b/media/libwebp/enc/syntax_enc.c
new file mode 100644
index 0000000000..28fd1f1ee0
--- /dev/null
+++ b/media/libwebp/enc/syntax_enc.c
@@ -0,0 +1,388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Header syntax writing
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../utils/utils.h"
+#include "../webp/format_constants.h" // RIFF constants
+#include "../webp/mux_types.h" // ALPHA_FLAG
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+static int IsVP8XNeeded(const VP8Encoder* const enc) {
+ return !!enc->has_alpha_; // Currently the only case when VP8X is needed.
+ // This could change in the future.
+}
+
+static int PutPaddingByte(const WebPPicture* const pic) {
+ const uint8_t pad_byte[1] = { 0 };
+ return !!pic->writer(pad_byte, 1, pic);
+}
+
+//------------------------------------------------------------------------------
+// Writers for header's various pieces (in order of appearance)
+
+static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
+ size_t riff_size) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t riff[RIFF_HEADER_SIZE] = {
+ 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
+ };
+ assert(riff_size == (uint32_t)riff_size);
+ PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+ if (!pic->writer(riff, sizeof(riff), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
+ 'V', 'P', '8', 'X'
+ };
+ uint32_t flags = 0;
+
+ assert(IsVP8XNeeded(enc));
+ assert(pic->width >= 1 && pic->height >= 1);
+ assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
+
+ if (enc->has_alpha_) {
+ flags |= ALPHA_FLAG;
+ }
+
+ PutLE32(vp8x + TAG_SIZE, VP8X_CHUNK_SIZE);
+ PutLE32(vp8x + CHUNK_HEADER_SIZE, flags);
+ PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
+ PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
+ if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
+ 'A', 'L', 'P', 'H'
+ };
+
+ assert(enc->has_alpha_);
+
+ // Alpha chunk header.
+ PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
+ if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+
+ // Alpha chunk data.
+ if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+
+ // Padding.
+ if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
+ size_t vp8_size) {
+ uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
+ 'V', 'P', '8', ' '
+ };
+ assert(vp8_size == (uint32_t)vp8_size);
+ PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
+ if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
+ int profile, size_t size0) {
+ uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
+ uint32_t bits;
+
+ if (size0 >= VP8_MAX_PARTITION0_SIZE) { // partition #0 is too big to fit
+ return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
+ }
+
+ // Paragraph 9.1.
+ bits = 0 // keyframe (1b)
+ | (profile << 1) // profile (3b)
+ | (1 << 4) // visible (1b)
+ | ((uint32_t)size0 << 5); // partition length (19b)
+ vp8_frm_hdr[0] = (bits >> 0) & 0xff;
+ vp8_frm_hdr[1] = (bits >> 8) & 0xff;
+ vp8_frm_hdr[2] = (bits >> 16) & 0xff;
+ // signature
+ vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
+ vp8_frm_hdr[4] = (VP8_SIGNATURE >> 8) & 0xff;
+ vp8_frm_hdr[5] = (VP8_SIGNATURE >> 0) & 0xff;
+ // dimensions
+ vp8_frm_hdr[6] = pic->width & 0xff;
+ vp8_frm_hdr[7] = pic->width >> 8;
+ vp8_frm_hdr[8] = pic->height & 0xff;
+ vp8_frm_hdr[9] = pic->height >> 8;
+
+ if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
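+
+// Worked sketch of the paragraph-9.1 packing above, with made-up numbers:
+// profile 0 and size0 = 1000 give bits = (1 << 4) | (1000 << 5) = 0x7d10,
+// stored little-endian as bytes 0x10 0x7d 0x00, followed by the signature
+// bytes 0x9d 0x01 0x2a.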
+
+// WebP Headers.
+static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
+ size_t vp8_size, size_t riff_size) {
+ WebPPicture* const pic = enc->pic_;
+ WebPEncodingError err = VP8_ENC_OK;
+
+ // RIFF header.
+ err = PutRIFFHeader(enc, riff_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // VP8X.
+ if (IsVP8XNeeded(enc)) {
+ err = PutVP8XHeader(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // Alpha.
+ if (enc->has_alpha_) {
+ err = PutAlphaChunk(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // VP8 header.
+ err = PutVP8Header(pic, vp8_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // VP8 frame header.
+ err = PutVP8FrameHeader(pic, enc->profile_, size0);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // All OK.
+ return 1;
+
+ // Error.
+ Error:
+ return WebPEncodingSetError(pic, err);
+}
+
+// Segmentation header
+static void PutSegmentHeader(VP8BitWriter* const bw,
+ const VP8Encoder* const enc) {
+ const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+ const VP8EncProba* const proba = &enc->proba_;
+ if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
+ // We always 'update' the quant and filter strength values
+ const int update_data = 1;
+ int s;
+ VP8PutBitUniform(bw, hdr->update_map_);
+ if (VP8PutBitUniform(bw, update_data)) {
+ // we always use absolute values, not relative ones
+ VP8PutBitUniform(bw, 1); // (segment_feature_mode = 1. Paragraph 9.3.)
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
+ }
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
+ }
+ }
+ if (hdr->update_map_) {
+ for (s = 0; s < 3; ++s) {
+ if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
+ VP8PutBits(bw, proba->segments_[s], 8);
+ }
+ }
+ }
+ }
+}
+
+// Filtering parameters header
+static void PutFilterHeader(VP8BitWriter* const bw,
+ const VP8EncFilterHeader* const hdr) {
+ const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
+ VP8PutBitUniform(bw, hdr->simple_);
+ VP8PutBits(bw, hdr->level_, 6);
+ VP8PutBits(bw, hdr->sharpness_, 3);
+ if (VP8PutBitUniform(bw, use_lf_delta)) {
+ // '0' is the default value for i4x4_lf_delta_ at frame #0.
+ const int need_update = (hdr->i4x4_lf_delta_ != 0);
+ if (VP8PutBitUniform(bw, need_update)) {
+ // we don't use ref_lf_delta => emit four 0 bits
+ VP8PutBits(bw, 0, 4);
+ // we use mode_lf_delta for i4x4
+ VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
+ VP8PutBits(bw, 0, 3); // all others unused
+ }
+ }
+}
+
+// Nominal quantization parameters
+static void PutQuant(VP8BitWriter* const bw,
+ const VP8Encoder* const enc) {
+ VP8PutBits(bw, enc->base_quant_, 7);
+ VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
+ VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
+}
+
+// Partition sizes
+static int EmitPartitionsSize(const VP8Encoder* const enc,
+ WebPPicture* const pic) {
+ uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
+ int p;
+ for (p = 0; p < enc->num_parts_ - 1; ++p) {
+ const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
+ if (part_size >= VP8_MAX_PARTITION_SIZE) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
+ }
+ buf[3 * p + 0] = (part_size >> 0) & 0xff;
+ buf[3 * p + 1] = (part_size >> 8) & 0xff;
+ buf[3 * p + 2] = (part_size >> 16) & 0xff;
+ }
+ return p ? pic->writer(buf, 3 * p, pic) : 1;
+}
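+
+// The size of the last partition is never written: the decoder infers it
+// from the bytes that remain. As a sketch, a part_size of 70000 (0x11170)
+// is stored as the little-endian triple 0x70 0x11 0x01.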
+
+//------------------------------------------------------------------------------
+
+static int GeneratePartition0(VP8Encoder* const enc) {
+ VP8BitWriter* const bw = &enc->bw_;
+ const int mb_size = enc->mb_w_ * enc->mb_h_;
+ uint64_t pos1, pos2, pos3;
+
+ pos1 = VP8BitWriterPos(bw);
+ if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) { // ~7 bits per macroblock
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ VP8PutBitUniform(bw, 0); // colorspace
+ VP8PutBitUniform(bw, 0); // clamp type
+
+ PutSegmentHeader(bw, enc);
+ PutFilterHeader(bw, &enc->filter_hdr_);
+ VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
+ enc->num_parts_ == 4 ? 2 :
+ enc->num_parts_ == 2 ? 1 : 0, 2);
+ PutQuant(bw, enc);
+ VP8PutBitUniform(bw, 0); // no proba update
+ VP8WriteProbas(bw, &enc->proba_);
+ pos2 = VP8BitWriterPos(bw);
+ VP8CodeIntraModes(enc);
+ VP8BitWriterFinish(bw);
+
+ pos3 = VP8BitWriterPos(bw);
+
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats) {
+ enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
+ enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
+ enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
+ }
+#else
+ (void)pos1;
+ (void)pos2;
+ (void)pos3;
+#endif
+ if (bw->error_) {
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ return 1;
+}
+
+void VP8EncFreeBitWriters(VP8Encoder* const enc) {
+ int p;
+ VP8BitWriterWipeOut(&enc->bw_);
+ for (p = 0; p < enc->num_parts_; ++p) {
+ VP8BitWriterWipeOut(enc->parts_ + p);
+ }
+}
+
+int VP8EncWrite(VP8Encoder* const enc) {
+ WebPPicture* const pic = enc->pic_;
+ VP8BitWriter* const bw = &enc->bw_;
+ const int task_percent = 19;
+ const int percent_per_part = task_percent / enc->num_parts_;
+ const int final_percent = enc->percent_ + task_percent;
+ int ok = 0;
+ size_t vp8_size, pad, riff_size;
+ int p;
+
+ // Partition #0 with header and partition sizes
+ ok = GeneratePartition0(enc);
+ if (!ok) return 0;
+
+ // Compute VP8 size
+ vp8_size = VP8_FRAME_HEADER_SIZE +
+ VP8BitWriterSize(bw) +
+ 3 * (enc->num_parts_ - 1);
+ for (p = 0; p < enc->num_parts_; ++p) {
+ vp8_size += VP8BitWriterSize(enc->parts_ + p);
+ }
+ pad = vp8_size & 1;
+ vp8_size += pad;
+
+ // Compute RIFF size
+ // At a minimum it is: "WEBPVP8 nnnn" + VP8 data size.
+ riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
+ if (IsVP8XNeeded(enc)) { // Add size for: VP8X header + data.
+ riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+ }
+ if (enc->has_alpha_) { // Add size for: ALPH header + data.
+ const uint32_t padded_alpha_size = enc->alpha_data_size_ +
+ (enc->alpha_data_size_ & 1);
+ riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
+ }
+ // RIFF size should fit in 32-bits.
+ if (riff_size > 0xfffffffeU) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
+ }
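+ // As a sketch with a made-up number: a bare lossy file with vp8_size =
+ // 1000 has riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + 1000 = 1012 and a
+ // total file size of CHUNK_HEADER_SIZE + riff_size = 1020 bytes.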
+
+ // Emit headers and partition #0
+ {
+ const uint8_t* const part0 = VP8BitWriterBuf(bw);
+ const size_t size0 = VP8BitWriterSize(bw);
+ ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
+ && pic->writer(part0, size0, pic)
+ && EmitPartitionsSize(enc, pic);
+ VP8BitWriterWipeOut(bw); // will free the internal buffer.
+ }
+
+ // Token partitions
+ for (p = 0; p < enc->num_parts_; ++p) {
+ const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
+ const size_t size = VP8BitWriterSize(enc->parts_ + p);
+ if (size) ok = ok && pic->writer(buf, size, pic);
+ VP8BitWriterWipeOut(enc->parts_ + p); // will free the internal buffer.
+ ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
+ &enc->percent_);
+ }
+
+ // Padding byte
+ if (ok && pad) {
+ ok = PutPaddingByte(pic);
+ }
+
+ enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
+ ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/media/libwebp/enc/token_enc.c b/media/libwebp/enc/token_enc.c
new file mode 100644
index 0000000000..52711eb782
--- /dev/null
+++ b/media/libwebp/enc/token_enc.c
@@ -0,0 +1,262 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Paginated token buffer
+//
+// A 'token' is a bit value associated with a probability, either fixed
+// or one determined later, after statistics have been collected.
+// For a dynamic probability, we just record the slot id (idx) of the
+// probability value in the final probability array (uint8_t* probas in
+// VP8EmitTokens).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+// we use pages to reduce the number of memcpy() calls
+#define MIN_PAGE_SIZE 8192   // minimum number of tokens per page
+#define FIXED_PROBA_BIT (1u << 14)
+
+typedef uint16_t token_t; // bit #15: bit value
+ // bit #14: flags for constant proba or idx
+ // bits #0..13: slot or constant proba
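+ // Example (illustrative values): a '1' bit with dynamic
+ // proba slot 330 is (1u << 15) | 330 = 0x814a; the same
+ // bit with fixed proba 128 is
+ // (1u << 15) | FIXED_PROBA_BIT | 128 = 0xc080.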
+struct VP8Tokens {
+ VP8Tokens* next_; // pointer to next page
+};
+// Token data is located in memory just after the next_ field.
+// This macro returns its address and hides the trick.
+#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
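+// In other words, each page is a single allocation laid out as
+// [ VP8Tokens header | page_size_ tokens ], and &(p)[1] is the address of
+// the first token just past the header (see TBufferNewPage() below).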
+
+//------------------------------------------------------------------------------
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+ b->tokens_ = NULL;
+ b->pages_ = NULL;
+ b->last_page_ = &b->pages_;
+ b->left_ = 0;
+ b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
+ b->error_ = 0;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+ if (b != NULL) {
+ VP8Tokens* p = b->pages_;
+ while (p != NULL) {
+ VP8Tokens* const next = p->next_;
+ WebPSafeFree(p);
+ p = next;
+ }
+ VP8TBufferInit(b, b->page_size_);
+ }
+}
+
+static int TBufferNewPage(VP8TBuffer* const b) {
+ VP8Tokens* page = NULL;
+ if (!b->error_) {
+ const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
+ page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
+ }
+ if (page == NULL) {
+ b->error_ = 1;
+ return 0;
+ }
+ page->next_ = NULL;
+
+ *b->last_page_ = page;
+ b->last_page_ = &page->next_;
+ b->left_ = b->page_size_;
+ b->tokens_ = (token_t*)TOKEN_DATA(page);
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define TOKEN_ID(t, b, ctx) \
+ (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
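+
+// TOKEN_ID flattens a (type, band, ctx) triple into the base offset of its
+// proba run inside the [NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] array
+// viewed as a flat uint8_t[]. Sketch, assuming the usual VP8 constants
+// (NUM_PROBAS = 11, NUM_CTX = 3, NUM_BANDS = 8):
+//   TOKEN_ID(1, 2, 0) = 11 * (0 + 3 * (2 + 8 * 1)) = 330.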
+
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+ uint32_t proba_idx,
+ proba_t* const stats) {
+ assert(proba_idx < FIXED_PROBA_BIT);
+ assert(bit <= 1);
+ if (b->left_ > 0 || TBufferNewPage(b)) {
+ const int slot = --b->left_;
+ b->tokens_[slot] = (bit << 15) | proba_idx;
+ }
+ VP8RecordStats(bit, stats);
+ return bit;
+}
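+
+// Note: if TBufferNewPage() fails, the token is silently dropped, but
+// b->error_ is set so the whole buffer is treated as invalid afterwards.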
+
+static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
+ uint32_t bit, uint32_t proba) {
+ assert(proba < 256);
+ assert(bit <= 1);
+ if (b->left_ > 0 || TBufferNewPage(b)) {
+ const int slot = --b->left_;
+ b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
+ }
+}
+
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
+ VP8TBuffer* const tokens) {
+ const int16_t* const coeffs = res->coeffs;
+ const int coeff_type = res->coeff_type;
+ const int last = res->last;
+ int n = res->first;
+ uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
+ return 0;
+ }
+
+ while (n < 16) {
+ const int c = coeffs[n++];
+ const int sign = c < 0;
+ const uint32_t v = sign ? -c : c;
+ if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0); // ctx=0
+ s = res->stats[VP8EncBands[n]][0];
+ continue;
+ }
+ if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1); // ctx=1
+ s = res->stats[VP8EncBands[n]][1];
+ } else {
+ if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+ if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+ AddToken(tokens, v == 4, base_id + 5, s + 5);
+ }
+ } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+ if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
+ AddConstantToken(tokens, v == 6, 159);
+ } else {
+ AddConstantToken(tokens, v >= 9, 165);
+ AddConstantToken(tokens, !(v & 1), 145);
+ }
+ } else {
+ int mask;
+ const uint8_t* tab;
+ uint32_t residue = v - 3;
+ if (residue < (8 << 1)) { // VP8Cat3 (3b)
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 9, s + 9);
+ residue -= (8 << 0);
+ mask = 1 << 2;
+ tab = VP8Cat3;
+ } else if (residue < (8 << 2)) { // VP8Cat4 (4b)
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 9, s + 9);
+ residue -= (8 << 1);
+ mask = 1 << 3;
+ tab = VP8Cat4;
+ } else if (residue < (8 << 3)) { // VP8Cat5 (5b)
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 10, s + 9);
+ residue -= (8 << 2);
+ mask = 1 << 4;
+ tab = VP8Cat5;
+ } else { // VP8Cat6 (11b)
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 10, s + 9);
+ residue -= (8 << 3);
+ mask = 1 << 10;
+ tab = VP8Cat6;
+ }
+ while (mask) {
+ AddConstantToken(tokens, !!(residue & mask), *tab++);
+ mask >>= 1;
+ }
+ }
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2); // ctx=2
+ s = res->stats[VP8EncBands[n]][2];
+ }
+ AddConstantToken(tokens, sign, 128);
+ if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
+ return 1; // EOB
+ }
+ }
+ return 1;
+}
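+
+// Worked sketch: a coefficient of +12 takes the v != 0, v > 1, v > 4 and
+// v > 10 branches and lands in VP8Cat3 (values 11..18): residue =
+// 12 - 3 - 8 = 1, emitted as the three extra bits 0, 0, 1 against the fixed
+// VP8Cat3 probabilities, followed by a sign token at proba 128.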
+
+#undef TOKEN_ID
+
+//------------------------------------------------------------------------------
+// Final coding pass, with known probabilities
+
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+ const uint8_t* const probas, int final_pass) {
+ const VP8Tokens* p = b->pages_;
+ assert(!b->error_);
+ while (p != NULL) {
+ const VP8Tokens* const next = p->next_;
+ const int N = (next == NULL) ? b->left_ : 0;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
+ while (n-- > N) {
+ const token_t token = tokens[n];
+ const int bit = (token >> 15) & 1;
+ if (token & FIXED_PROBA_BIT) {
+ VP8PutBit(bw, bit, token & 0xffu); // constant proba
+ } else {
+ VP8PutBit(bw, bit, probas[token & 0x3fffu]);
+ }
+ }
+ if (final_pass) WebPSafeFree((void*)p);
+ p = next;
+ }
+ if (final_pass) b->pages_ = NULL;
+ return 1;
+}
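+
+// Tokens are stored from the top of each page downwards (slot = --left_),
+// so scanning n from page_size_ - 1 down to N replays them in the order in
+// which they were recorded.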
+
+// Size estimation
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
+ size_t size = 0;
+ const VP8Tokens* p = b->pages_;
+ assert(!b->error_);
+ while (p != NULL) {
+ const VP8Tokens* const next = p->next_;
+ const int N = (next == NULL) ? b->left_ : 0;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
+ while (n-- > N) {
+ const token_t token = tokens[n];
+ const int bit = token & (1 << 15);
+ if (token & FIXED_PROBA_BIT) {
+ size += VP8BitCost(bit, token & 0xffu);
+ } else {
+ size += VP8BitCost(bit, probas[token & 0x3fffu]);
+ }
+ }
+ p = next;
+ }
+ return size;
+}
+
+//------------------------------------------------------------------------------
+
+#else // DISABLE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+ (void)b;
+ (void)page_size;
+}
+void VP8TBufferClear(VP8TBuffer* const b) {
+ (void)b;
+}
+
+#endif // !DISABLE_TOKEN_BUFFER
+
diff --git a/media/libwebp/enc/tree_enc.c b/media/libwebp/enc/tree_enc.c
new file mode 100644
index 0000000000..7bf9b47d08
--- /dev/null
+++ b/media/libwebp/enc/tree_enc.c
@@ -0,0 +1,504 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding of token probabilities, intra modes and segments.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Default probabilities
+
+// Paragraph 13.5
+const uint8_t
+ VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+ { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ },
+ { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ },
+ { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
+
+void VP8DefaultProbas(VP8Encoder* const enc) {
+ VP8EncProba* const probas = &enc->proba_;
+ probas->use_skip_proba_ = 0;
+ memset(probas->segments_, 255u, sizeof(probas->segments_));
+ memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
+ // Note: we could hard-code the level_costs_ corresponding to VP8CoeffsProba0,
+ // but that's ~11k of static data. Better to call VP8CalculateLevelCosts() later.
+ probas->dirty_ = 1;
+}
+
+// Paragraph 11.5. 900 bytes.
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+ { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+ { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+ { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+ { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+ { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+ { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+ { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+ { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+ { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+ { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+static int PutI4Mode(VP8BitWriter* const bw, int mode,
+ const uint8_t* const prob) {
+ if (VP8PutBit(bw, mode != B_DC_PRED, prob[0])) {
+ if (VP8PutBit(bw, mode != B_TM_PRED, prob[1])) {
+ if (VP8PutBit(bw, mode != B_VE_PRED, prob[2])) {
+ if (!VP8PutBit(bw, mode >= B_LD_PRED, prob[3])) {
+ if (VP8PutBit(bw, mode != B_HE_PRED, prob[4])) {
+ VP8PutBit(bw, mode != B_RD_PRED, prob[5]);
+ }
+ } else {
+ if (VP8PutBit(bw, mode != B_LD_PRED, prob[6])) {
+ if (VP8PutBit(bw, mode != B_VL_PRED, prob[7])) {
+ VP8PutBit(bw, mode != B_HD_PRED, prob[8]);
+ }
+ }
+ }
+ }
+ }
+ }
+ return mode;
+}
+
+static void PutI16Mode(VP8BitWriter* const bw, int mode) {
+ if (VP8PutBit(bw, (mode == TM_PRED || mode == H_PRED), 156)) {
+ VP8PutBit(bw, mode == TM_PRED, 128); // TM or HE
+ } else {
+ VP8PutBit(bw, mode == V_PRED, 163); // VE or DC
+ }
+}
+
+static void PutUVMode(VP8BitWriter* const bw, int uv_mode) {
+ if (VP8PutBit(bw, uv_mode != DC_PRED, 142)) {
+ if (VP8PutBit(bw, uv_mode != V_PRED, 114)) {
+ VP8PutBit(bw, uv_mode != H_PRED, 183); // else: TM_PRED
+ }
+ }
+}
+
+static void PutSegment(VP8BitWriter* const bw, int s, const uint8_t* p) {
+ if (VP8PutBit(bw, s >= 2, p[0])) p += 1;
+ VP8PutBit(bw, s & 1, p[1]);
+}
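+
+// A two-level tree: segments 0..1 are coded as (0, s) against p[0]/p[1],
+// segments 2..3 as (1, s & 1) against p[0]/p[2] (p was advanced by one).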
+
+void VP8CodeIntraModes(VP8Encoder* const enc) {
+ VP8BitWriter* const bw = &enc->bw_;
+ VP8EncIterator it;
+ VP8IteratorInit(enc, &it);
+ do {
+ const VP8MBInfo* const mb = it.mb_;
+ const uint8_t* preds = it.preds_;
+ if (enc->segment_hdr_.update_map_) {
+ PutSegment(bw, mb->segment_, enc->proba_.segments_);
+ }
+ if (enc->proba_.use_skip_proba_) {
+ VP8PutBit(bw, mb->skip_, enc->proba_.skip_proba_);
+ }
+ if (VP8PutBit(bw, (mb->type_ != 0), 145)) { // i16x16
+ PutI16Mode(bw, preds[0]);
+ } else {
+ const int preds_w = enc->preds_w_;
+ const uint8_t* top_pred = preds - preds_w;
+ int x, y;
+ for (y = 0; y < 4; ++y) {
+ int left = preds[-1];
+ for (x = 0; x < 4; ++x) {
+ const uint8_t* const probas = kBModesProba[top_pred[x]][left];
+ left = PutI4Mode(bw, preds[x], probas);
+ }
+ top_pred = preds;
+ preds += preds_w;
+ }
+ }
+ PutUVMode(bw, mb->uv_mode_);
+ } while (VP8IteratorNext(&it));
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 13
+
+const uint8_t
+ VP8CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+ { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+ { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+ },
+ { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ }
+};
+
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
+ int t, b, c, p;
+ for (t = 0; t < NUM_TYPES; ++t) {
+ for (b = 0; b < NUM_BANDS; ++b) {
+ for (c = 0; c < NUM_CTX; ++c) {
+ for (p = 0; p < NUM_PROBAS; ++p) {
+ const uint8_t p0 = probas->coeffs_[t][b][c][p];
+ const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
+ if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
+ VP8PutBits(bw, p0, 8);
+ }
+ }
+ }
+ }
+ }
+ if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
+ VP8PutBits(bw, probas->skip_proba_, 8);
+ }
+}
+
diff --git a/media/libwebp/enc/vp8i_enc.h b/media/libwebp/enc/vp8i_enc.h
index 009ccf2239..9bca205e01 100644
--- a/media/libwebp/enc/vp8i_enc.h
+++ b/media/libwebp/enc/vp8i_enc.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 0
+#define ENC_MIN_VERSION 2
#define ENC_REV_VERSION 2
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
@@ -249,7 +249,7 @@ typedef struct {
int percent0_; // saved initial progress percent
DError left_derr_; // left error diffusion (u/v)
- DError *top_derr_; // top diffusion error - NULL if disabled
+ DError* top_derr_; // top diffusion error - NULL if disabled
uint8_t* y_left_; // left luma samples (addressable from index -1 to 15).
uint8_t* u_left_; // left u samples (addressable from index -1 to 7)
@@ -286,8 +286,7 @@ int VP8IteratorNext(VP8EncIterator* const it);
// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
void VP8IteratorSaveBoundary(VP8EncIterator* const it);
// Report progression based on macroblock rows. Return 0 for user-abort request.
-int VP8IteratorProgress(const VP8EncIterator* const it,
- int final_delta_percent);
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta);
// Intra4x4 iterations
void VP8IteratorStartI4(VP8EncIterator* const it);
// returns true if not done.
@@ -505,9 +504,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
// Returns false in case of error (invalid param, out-of-memory).
int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
-// Clean-up the RGB samples under fully transparent area, to help lossless
-// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+// Replace samples that are fully transparent by 'color' to help compressibility
+// (no guarantee, though). Assumes pic->use_argb is true.
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color);
//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/vp8l_enc.c b/media/libwebp/enc/vp8l_enc.c
new file mode 100644
index 0000000000..4aed5d8e32
--- /dev/null
+++ b/media/libwebp/enc/vp8l_enc.c
@@ -0,0 +1,2138 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// main entry for the lossless encoder.
+//
+// Author: Vikas Arora (vikaas.arora@gmail.com)
+//
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// Maximum number of histogram images (sub-blocks).
+#define MAX_HUFF_IMAGE_SIZE 2600
+
+// Palette reordering for smaller sum of deltas (and for smaller storage).
+
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+ const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+ const uint32_t b = WebPMemToUint32((uint8_t*)p2);
+ assert(a != b);
+ return (a < b) ? -1 : 1;
+}
+
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+ return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a value that is related to the entropy created by the
+// palette entry diff.
+//
+// Note that the last & 0xff is a no-operation in the next statement; most
+// compilers remove it, and it is kept only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+ const uint32_t diff = VP8LSubPixels(col1, col2);
+ const int kMoreWeightForRGBThanForAlpha = 9;
+ uint32_t score;
+ score = PaletteComponentDistance((diff >> 0) & 0xff);
+ score += PaletteComponentDistance((diff >> 8) & 0xff);
+ score += PaletteComponentDistance((diff >> 16) & 0xff);
+ score *= kMoreWeightForRGBThanForAlpha;
+ score += PaletteComponentDistance((diff >> 24) & 0xff);
+ return score;
+}
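A worked standalone example of the distance above, with VP8LSubPixels replaced by an equivalent per-channel wrap-around subtraction (sample colors, nothing canonical):

#include <stdint.h>
#include <stdio.h>

/* Channel-wise (a - b) mod 256, equivalent in effect to VP8LSubPixels
 * for this purpose. */
static uint32_t SubPixels(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    r |= (((a >> shift) - (b >> shift)) & 0xff) << shift;
  }
  return r;
}

static uint32_t ComponentDistance(uint32_t v) {
  return (v <= 128) ? v : (256 - v);
}

int main(void) {
  /* Two nearby opaque colors: the score is dominated by the 9x RGB weight. */
  const uint32_t c1 = 0xff204060u, c2 = 0xff1f4263u;
  const uint32_t diff = SubPixels(c1, c2);   /* 0x0001fefd */
  uint32_t score = ComponentDistance((diff >> 0) & 0xff) +
                   ComponentDistance((diff >> 8) & 0xff) +
                   ComponentDistance((diff >> 16) & 0xff);
  score = score * 9 + ComponentDistance((diff >> 24) & 0xff);
  printf("score = %u\n", score);  /* (3+2+1)*9 + 0 = 54 */
  return 0;
}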
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+ const uint32_t tmp = *col1;
+ *col1 = *col2;
+ *col2 = tmp;
+}
+
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+ int num_colors) {
+ int low = 0, hi = num_colors;
+ if (sorted[low] == color) return low; // loop invariant: sorted[low] != color
+ while (1) {
+ const int mid = (low + hi) >> 1;
+ if (sorted[mid] == color) {
+ return mid;
+ } else if (sorted[mid] < color) {
+ low = mid;
+ } else {
+ hi = mid;
+ }
+ }
+ assert(0);
+ return 0;
+}
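The search above assumes the color is present in the array; the initial low-check is what lets the loop run without a termination test. A standalone check of that contract (toy values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same contract as SearchColorNoIdx: 'color' must be present in 'sorted'.
 * The early return establishes sorted[low] != color inside the loop, and
 * the presence guarantee keeps the interval [low, hi) shrinking onto a
 * match, so no explicit loop exit condition is needed. */
static int Search(const uint32_t sorted[], uint32_t color, int n) {
  int low = 0, hi = n;
  if (sorted[low] == color) return low;
  while (1) {
    const int mid = (low + hi) >> 1;
    if (sorted[mid] == color) return mid;
    if (sorted[mid] < color) low = mid; else hi = mid;
  }
}

int main(void) {
  const uint32_t sorted[5] = { 1, 4, 9, 16, 25 };
  assert(Search(sorted, 16, 5) == 3);
  assert(Search(sorted, 1, 5) == 0);
  printf("ok\n");
  return 0;
}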
+
+// The palette has been sorted by alpha. This function checks whether the
+// other components of the palette develop monotonically with the position
+// in the palette. If they all do, there is no benefit to re-organizing
+// them greedily. Monotonic development is typical of green-only situations
+// (like lossy alpha) or gray-scale images.
+static int PaletteHasNonMonotonousDeltas(const uint32_t* const palette,
+ int num_colors) {
+ uint32_t predict = 0x000000;
+ int i;
+ uint8_t sign_found = 0x00;
+ for (i = 0; i < num_colors; ++i) {
+ const uint32_t diff = VP8LSubPixels(palette[i], predict);
+ const uint8_t rd = (diff >> 16) & 0xff;
+ const uint8_t gd = (diff >> 8) & 0xff;
+ const uint8_t bd = (diff >> 0) & 0xff;
+ if (rd != 0x00) {
+ sign_found |= (rd < 0x80) ? 1 : 2;
+ }
+ if (gd != 0x00) {
+ sign_found |= (gd < 0x80) ? 8 : 16;
+ }
+ if (bd != 0x00) {
+ sign_found |= (bd < 0x80) ? 64 : 128;
+ }
+ predict = palette[i];
+ }
+  return (sign_found & (sign_found << 1)) != 0;  // two consecutive signs.
+}
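The final test works because each channel owns an adjacent (negative, positive) bit pair, separated by unused gap bits, so shifting left by one can only overlap the two bits of a single channel. A standalone sketch:

#include <stdio.h>

/* Bit pairs as used above: red 1|2, green 8|16, blue 64|128, with gap
 * bits in between, so (mask & (mask << 1)) can only be non-zero when
 * both signs of one channel were seen. */
static int HasBothSigns(unsigned mask) { return (mask & (mask << 1)) != 0; }

int main(void) {
  printf("%d\n", HasBothSigns(1 | 2));   /* 1: red moved both ways       */
  printf("%d\n", HasBothSigns(1 | 16));  /* 0: different channels        */
  printf("%d\n", HasBothSigns(2 | 8));   /* 0: 2<<1 lands on the gap bit */
  return 0;
}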
+
+static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
+ int num_colors, uint32_t* const palette) {
+ uint32_t predict = 0x00000000;
+ int i, k;
+ memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+ if (!PaletteHasNonMonotonousDeltas(palette_sorted, num_colors)) return;
+  // Greedily pick the color closest to the predicted one, to minimize the
+  // deltas in the palette. This reduces storage needs, since the palette
+  // is stored with delta encoding.
+ for (i = 0; i < num_colors; ++i) {
+ int best_ix = i;
+ uint32_t best_score = ~0U;
+ for (k = i; k < num_colors; ++k) {
+ const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+ if (best_score > cur_score) {
+ best_score = cur_score;
+ best_ix = k;
+ }
+ }
+ SwapColor(&palette[best_ix], &palette[i]);
+ predict = palette[i];
+ }
+}
+
+// Sort palette in increasing order and prepare an inverse mapping array.
+static void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+ uint32_t sorted[], uint32_t idx_map[]) {
+ uint32_t i;
+ memcpy(sorted, palette, num_colors * sizeof(*sorted));
+ qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
+ for (i = 0; i < num_colors; ++i) {
+ idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+
+// Finds the biggest cooccurrence in the matrix.
+static void CoOccurrenceFindMax(const uint32_t* const cooccurrence,
+ uint32_t num_colors, uint8_t* const c1,
+ uint8_t* const c2) {
+ // Find the index that is most frequently located adjacent to other
+ // (different) indexes.
+ uint32_t best_sum = 0u;
+ uint32_t i, j, best_cooccurrence;
+ *c1 = 0u;
+ for (i = 0; i < num_colors; ++i) {
+ uint32_t sum = 0;
+ for (j = 0; j < num_colors; ++j) sum += cooccurrence[i * num_colors + j];
+ if (sum > best_sum) {
+ best_sum = sum;
+ *c1 = i;
+ }
+ }
+ // Find the index that is most frequently found adjacent to *c1.
+ *c2 = 0u;
+ best_cooccurrence = 0u;
+ for (i = 0; i < num_colors; ++i) {
+ if (cooccurrence[*c1 * num_colors + i] > best_cooccurrence) {
+ best_cooccurrence = cooccurrence[*c1 * num_colors + i];
+ *c2 = i;
+ }
+ }
+ assert(*c1 != *c2);
+}
+
+// Builds the cooccurrence matrix
+static WebPEncodingError CoOccurrenceBuild(const WebPPicture* const pic,
+ const uint32_t* const palette,
+ uint32_t num_colors,
+ uint32_t* cooccurrence) {
+ uint32_t *lines, *line_top, *line_current, *line_tmp;
+ int x, y;
+ const uint32_t* src = pic->argb;
+ uint32_t prev_pix = ~src[0];
+ uint32_t prev_idx = 0u;
+ uint32_t idx_map[MAX_PALETTE_SIZE] = {0};
+ uint32_t palette_sorted[MAX_PALETTE_SIZE];
+ lines = (uint32_t*)WebPSafeMalloc(2 * pic->width, sizeof(*lines));
+ if (lines == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ line_top = &lines[0];
+ line_current = &lines[pic->width];
+ PrepareMapToPalette(palette, num_colors, palette_sorted, idx_map);
+ for (y = 0; y < pic->height; ++y) {
+ for (x = 0; x < pic->width; ++x) {
+ const uint32_t pix = src[x];
+ if (pix != prev_pix) {
+ prev_idx = idx_map[SearchColorNoIdx(palette_sorted, pix, num_colors)];
+ prev_pix = pix;
+ }
+ line_current[x] = prev_idx;
+ // 4-connectivity is what works best as mentioned in "On the relation
+ // between Memon's and the modified Zeng's palette reordering methods".
+ if (x > 0 && prev_idx != line_current[x - 1]) {
+ const uint32_t left_idx = line_current[x - 1];
+ ++cooccurrence[prev_idx * num_colors + left_idx];
+ ++cooccurrence[left_idx * num_colors + prev_idx];
+ }
+ if (y > 0 && prev_idx != line_top[x]) {
+ const uint32_t top_idx = line_top[x];
+ ++cooccurrence[prev_idx * num_colors + top_idx];
+ ++cooccurrence[top_idx * num_colors + prev_idx];
+ }
+ }
+ line_tmp = line_top;
+ line_top = line_current;
+ line_current = line_tmp;
+ src += pic->argb_stride;
+ }
+ WebPSafeFree(lines);
+ return VP8_ENC_OK;
+}
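A minimal standalone sketch of the 4-connectivity counting above, on a hypothetical 2x2 index map (the pixel-to-index cache and palette lookup are elided):

#include <stdio.h>

/* Index map:
 *   0 1
 *   1 1
 * Each horizontally or vertically adjacent pair of *different* indices
 * bumps both symmetric cells of the co-occurrence matrix. */
int main(void) {
  const int idx[2][2] = { { 0, 1 }, { 1, 1 } };
  unsigned cooc[2][2] = { { 0 } };
  int x, y;
  for (y = 0; y < 2; ++y) {
    for (x = 0; x < 2; ++x) {
      if (x > 0 && idx[y][x] != idx[y][x - 1]) {
        ++cooc[idx[y][x]][idx[y][x - 1]];
        ++cooc[idx[y][x - 1]][idx[y][x]];
      }
      if (y > 0 && idx[y][x] != idx[y - 1][x]) {
        ++cooc[idx[y][x]][idx[y - 1][x]];
        ++cooc[idx[y - 1][x]][idx[y][x]];
      }
    }
  }
  printf("cooc[0][1]=%u cooc[1][0]=%u\n", cooc[0][1], cooc[1][0]); /* 2 2 */
  return 0;
}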
+
+struct Sum {
+ uint8_t index;
+ uint32_t sum;
+};
+
+// Implements the modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+static WebPEncodingError PaletteSortModifiedZeng(
+ const WebPPicture* const pic, const uint32_t* const palette_sorted,
+ uint32_t num_colors, uint32_t* const palette) {
+ uint32_t i, j, ind;
+ uint8_t remapping[MAX_PALETTE_SIZE];
+ uint32_t* cooccurrence;
+ struct Sum sums[MAX_PALETTE_SIZE];
+ uint32_t first, last;
+ uint32_t num_sums;
+ // TODO(vrabaud) check whether one color images should use palette or not.
+ if (num_colors <= 1) return VP8_ENC_OK;
+ // Build the co-occurrence matrix.
+ cooccurrence =
+ (uint32_t*)WebPSafeCalloc(num_colors * num_colors, sizeof(*cooccurrence));
+ if (cooccurrence == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ if (CoOccurrenceBuild(pic, palette_sorted, num_colors, cooccurrence) !=
+ VP8_ENC_OK) {
+ WebPSafeFree(cooccurrence);
+ return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ }
+
+ // Initialize the mapping list with the two best indices.
+ CoOccurrenceFindMax(cooccurrence, num_colors, &remapping[0], &remapping[1]);
+
+  // We need to both append to and prepend to the remapping list. To this
+  // end, we track the start/end of the list as indices into a vector (with
+  // a wrap-around when the end is reached).
+ first = 0;
+ last = 1;
+ num_sums = num_colors - 2; // -2 because we know the first two values
+ if (num_sums > 0) {
+ // Initialize the sums with the first two remappings and find the best one
+ struct Sum* best_sum = &sums[0];
+ best_sum->index = 0u;
+ best_sum->sum = 0u;
+ for (i = 0, j = 0; i < num_colors; ++i) {
+ if (i == remapping[0] || i == remapping[1]) continue;
+ sums[j].index = i;
+ sums[j].sum = cooccurrence[i * num_colors + remapping[0]] +
+ cooccurrence[i * num_colors + remapping[1]];
+ if (sums[j].sum > best_sum->sum) best_sum = &sums[j];
+ ++j;
+ }
+
+ while (num_sums > 0) {
+ const uint8_t best_index = best_sum->index;
+ // Compute delta to know if we need to prepend or append the best index.
+ int32_t delta = 0;
+ const int32_t n = num_colors - num_sums;
+ for (ind = first, j = 0; (ind + j) % num_colors != last + 1; ++j) {
+ const uint16_t l_j = remapping[(ind + j) % num_colors];
+ delta += (n - 1 - 2 * (int32_t)j) *
+ (int32_t)cooccurrence[best_index * num_colors + l_j];
+ }
+ if (delta > 0) {
+ first = (first == 0) ? num_colors - 1 : first - 1;
+ remapping[first] = best_index;
+ } else {
+ ++last;
+ remapping[last] = best_index;
+ }
+ // Remove best_sum from sums.
+ *best_sum = sums[num_sums - 1];
+ --num_sums;
+ // Update all the sums and find the best one.
+ best_sum = &sums[0];
+ for (i = 0; i < num_sums; ++i) {
+ sums[i].sum += cooccurrence[best_index * num_colors + sums[i].index];
+ if (sums[i].sum > best_sum->sum) best_sum = &sums[i];
+ }
+ }
+ }
+ assert((last + 1) % num_colors == first);
+ WebPSafeFree(cooccurrence);
+
+ // Re-map the palette.
+ for (i = 0; i < num_colors; ++i) {
+ palette[i] = palette_sorted[remapping[(first + i) % num_colors]];
+ }
+ return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
+// These six modes are evaluated and their respective entropy is computed.
+typedef enum {
+ kDirect = 0,
+ kSpatial = 1,
+ kSubGreen = 2,
+ kSpatialSubGreen = 3,
+ kPalette = 4,
+ kPaletteAndSpatial = 5,
+ kNumEntropyIx = 6
+} EntropyIx;
+
+typedef enum {
+ kSortedDefault = 0,
+ kMinimizeDelta = 1,
+ kModifiedZeng = 2,
+ kUnusedPalette = 3,
+} PaletteSorting;
+
+typedef enum {
+ kHistoAlpha = 0,
+ kHistoAlphaPred,
+ kHistoGreen,
+ kHistoGreenPred,
+ kHistoRed,
+ kHistoRedPred,
+ kHistoBlue,
+ kHistoBluePred,
+ kHistoRedSubGreen,
+ kHistoRedPredSubGreen,
+ kHistoBlueSubGreen,
+ kHistoBluePredSubGreen,
+ kHistoPalette,
+ kHistoTotal // Must be last.
+} HistoIx;
+
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+ const int green = p >> 8; // The upper bits are masked away later.
+ ++r[((p >> 16) - green) & 0xff];
+ ++b[((p >> 0) - green) & 0xff];
+}
+
+static void AddSingle(uint32_t p,
+ uint32_t* const a, uint32_t* const r,
+ uint32_t* const g, uint32_t* const b) {
+ ++a[(p >> 24) & 0xff];
+ ++r[(p >> 16) & 0xff];
+ ++g[(p >> 8) & 0xff];
+ ++b[(p >> 0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+ // Note that masking with 0xffffffffu is for preventing an
+ // 'unsigned int overflow' warning. Doesn't impact the compiled code.
+ return ((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) & 0xffffffffu) >> 24;
+}
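The hash above can be evaluated standalone; the multiply-and-shift mixing tends to send even near-identical pixel values to different 8-bit buckets (sample values only):

#include <stdint.h>
#include <stdio.h>

/* Same mixing as HashPix: 64-bit multiply, keep the low 32 bits, then
 * the top 8 of those select one of 256 buckets. */
static uint32_t HashPix(uint32_t pix) {
  return (uint32_t)((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) &
                    0xffffffffu) >> 24;
}

int main(void) {
  /* Consecutive pixel values land in different buckets because the low
   * 32 bits of the products differ by the full multiplier. */
  printf("%u %u %u\n", HashPix(0xff000000u), HashPix(0xff000001u),
         HashPix(0xff010101u));
  return 0;
}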
+
+static int AnalyzeEntropy(const uint32_t* argb,
+ int width, int height, int argb_stride,
+ int use_palette,
+ int palette_size, int transform_bits,
+ EntropyIx* const min_entropy_ix,
+ int* const red_and_blue_always_zero) {
+ // Allocate histogram set with cache_bits = 0.
+ uint32_t* histo;
+
+ if (use_palette && palette_size <= 16) {
+ // In the case of small palettes, we pack 2, 4 or 8 pixels together. In
+ // practice, small palettes are better than any other transform.
+ *min_entropy_ix = kPalette;
+ *red_and_blue_always_zero = 1;
+ return 1;
+ }
+ histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+ if (histo != NULL) {
+ int i, x, y;
+ const uint32_t* prev_row = NULL;
+ const uint32_t* curr_row = argb;
+ uint32_t pix_prev = argb[0]; // Skip the first pixel.
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ const uint32_t pix = curr_row[x];
+ const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev);
+ pix_prev = pix;
+ if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) {
+ continue;
+ }
+ AddSingle(pix,
+ &histo[kHistoAlpha * 256],
+ &histo[kHistoRed * 256],
+ &histo[kHistoGreen * 256],
+ &histo[kHistoBlue * 256]);
+ AddSingle(pix_diff,
+ &histo[kHistoAlphaPred * 256],
+ &histo[kHistoRedPred * 256],
+ &histo[kHistoGreenPred * 256],
+ &histo[kHistoBluePred * 256]);
+ AddSingleSubGreen(pix,
+ &histo[kHistoRedSubGreen * 256],
+ &histo[kHistoBlueSubGreen * 256]);
+ AddSingleSubGreen(pix_diff,
+ &histo[kHistoRedPredSubGreen * 256],
+ &histo[kHistoBluePredSubGreen * 256]);
+ {
+ // Approximate the palette by the entropy of the multiplicative hash.
+ const uint32_t hash = HashPix(pix);
+ ++histo[kHistoPalette * 256 + hash];
+ }
+ }
+ prev_row = curr_row;
+ curr_row += argb_stride;
+ }
+ {
+ double entropy_comp[kHistoTotal];
+ double entropy[kNumEntropyIx];
+ int k;
+ int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
+ int j;
+      // Let's add one zero to the predicted histograms. The zeros are
+      // removed too efficiently by the pix_diff == 0 comparison, yet at
+      // least one zero is likely to exist.
+ ++histo[kHistoRedPredSubGreen * 256];
+ ++histo[kHistoBluePredSubGreen * 256];
+ ++histo[kHistoRedPred * 256];
+ ++histo[kHistoGreenPred * 256];
+ ++histo[kHistoBluePred * 256];
+ ++histo[kHistoAlphaPred * 256];
+
+ for (j = 0; j < kHistoTotal; ++j) {
+ entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
+ }
+ entropy[kDirect] = entropy_comp[kHistoAlpha] +
+ entropy_comp[kHistoRed] +
+ entropy_comp[kHistoGreen] +
+ entropy_comp[kHistoBlue];
+ entropy[kSpatial] = entropy_comp[kHistoAlphaPred] +
+ entropy_comp[kHistoRedPred] +
+ entropy_comp[kHistoGreenPred] +
+ entropy_comp[kHistoBluePred];
+ entropy[kSubGreen] = entropy_comp[kHistoAlpha] +
+ entropy_comp[kHistoRedSubGreen] +
+ entropy_comp[kHistoGreen] +
+ entropy_comp[kHistoBlueSubGreen];
+ entropy[kSpatialSubGreen] = entropy_comp[kHistoAlphaPred] +
+ entropy_comp[kHistoRedPredSubGreen] +
+ entropy_comp[kHistoGreenPred] +
+ entropy_comp[kHistoBluePredSubGreen];
+ entropy[kPalette] = entropy_comp[kHistoPalette];
+
+ // When including transforms, there is an overhead in bits from
+ // storing them. This overhead is small but matters for small images.
+ // For spatial, there are 14 transformations.
+ entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+ VP8LSubSampleSize(height, transform_bits) *
+ VP8LFastLog2(14);
+ // For color transforms: 24 as only 3 channels are considered in a
+ // ColorTransformElement.
+ entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
+ VP8LSubSampleSize(height, transform_bits) *
+ VP8LFastLog2(24);
+ // For palettes, add the cost of storing the palette.
+ // We empirically estimate the cost of a compressed entry as 8 bits.
+ // The palette is differential-coded when compressed hence a much
+ // lower cost than sizeof(uint32_t)*8.
+ entropy[kPalette] += palette_size * 8;
+
+ *min_entropy_ix = kDirect;
+ for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
+ if (entropy[*min_entropy_ix] > entropy[k]) {
+ *min_entropy_ix = (EntropyIx)k;
+ }
+ }
+ assert((int)*min_entropy_ix <= last_mode_to_analyze);
+ *red_and_blue_always_zero = 1;
+ // Let's check if the histogram of the chosen entropy mode has
+ // non-zero red and blue values. If all are zero, we can later skip
+ // the cross color optimization.
+ {
+ static const uint8_t kHistoPairs[5][2] = {
+ { kHistoRed, kHistoBlue },
+ { kHistoRedPred, kHistoBluePred },
+ { kHistoRedSubGreen, kHistoBlueSubGreen },
+ { kHistoRedPredSubGreen, kHistoBluePredSubGreen },
+ { kHistoRed, kHistoBlue }
+ };
+ const uint32_t* const red_histo =
+ &histo[256 * kHistoPairs[*min_entropy_ix][0]];
+ const uint32_t* const blue_histo =
+ &histo[256 * kHistoPairs[*min_entropy_ix][1]];
+ for (i = 1; i < 256; ++i) {
+ if ((red_histo[i] | blue_histo[i]) != 0) {
+ *red_and_blue_always_zero = 0;
+ break;
+ }
+ }
+ }
+ }
+ WebPSafeFree(histo);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+ // Make tile size a function of encoding method (Range: 0 to 6).
+ int histo_bits = (use_palette ? 9 : 7) - method;
+ while (1) {
+ const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+ VP8LSubSampleSize(height, histo_bits);
+ if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+ ++histo_bits;
+ }
+ return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+ (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
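A worked standalone example of the tile-size search above, with VP8LSubSampleSize re-implemented inline and the clamp bounds written as 2 and 9 (assumed values of MIN_HUFFMAN_BITS/MAX_HUFFMAN_BITS):

#include <stdio.h>

/* ceil(size / 2^bits), as VP8LSubSampleSize computes it. */
static int SubSampleSize(int size, int bits) {
  return (size + (1 << bits) - 1) >> bits;
}

static int HistoBits(int method, int use_palette, int w, int h) {
  int bits = (use_palette ? 9 : 7) - method;
  while (SubSampleSize(w, bits) * SubSampleSize(h, bits) > 2600) ++bits;
  return (bits < 2) ? 2 : (bits > 9) ? 9 : bits;  /* assumed clamp bounds */
}

int main(void) {
  /* 1024x1024, method 4, no palette: 3 (16384 tiles) -> 4 (4096) ->
   * 5 (1024 tiles of 32x32, under the 2600 cap). */
  printf("histo_bits = %d\n", HistoBits(4, 0, 1024, 1024));  /* 5 */
  return 0;
}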
+
+static int GetTransformBits(int method, int histo_bits) {
+ const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+ const int res =
+ (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+ assert(res <= MAX_TRANSFORM_BITS);
+ return res;
+}
+
+// Set of parameters to be used in each iteration of the cruncher.
+#define CRUNCH_SUBCONFIGS_MAX 2
+typedef struct {
+ int lz77_;
+ int do_no_cache_;
+} CrunchSubConfig;
+typedef struct {
+ int entropy_idx_;
+ PaletteSorting palette_sorting_type_;
+ CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX];
+ int sub_configs_size_;
+} CrunchConfig;
+
+// +2 because we add a palette sorting configuration for kPalette and
+// kPaletteAndSpatial.
+#define CRUNCH_CONFIGS_MAX (kNumEntropyIx + 2)
+
+static int EncoderAnalyze(VP8LEncoder* const enc,
+ CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
+ int* const crunch_configs_size,
+ int* const red_and_blue_always_zero) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const WebPConfig* const config = enc->config_;
+ const int method = config->method;
+ const int low_effort = (config->method == 0);
+ int i;
+ int use_palette;
+ int n_lz77s;
+  // If set to 0, analyze only with the computed cache value. If set to 1,
+  // also analyze with no cache.
+ int do_no_cache = 0;
+ assert(pic != NULL && pic->argb != NULL);
+
+ // Check whether a palette is possible.
+ enc->palette_size_ = WebPGetColorPalette(pic, enc->palette_sorted_);
+ use_palette = (enc->palette_size_ <= MAX_PALETTE_SIZE);
+ if (!use_palette) {
+ enc->palette_size_ = 0;
+ } else {
+ qsort(enc->palette_sorted_, enc->palette_size_,
+ sizeof(*enc->palette_sorted_), PaletteCompareColorsForQsort);
+ }
+
+ // Empirical bit sizes.
+ enc->histo_bits_ = GetHistoBits(method, use_palette,
+ pic->width, pic->height);
+ enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+
+ if (low_effort) {
+ // AnalyzeEntropy is somewhat slow.
+ crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+ crunch_configs[0].palette_sorting_type_ =
+ use_palette ? kSortedDefault : kUnusedPalette;
+ n_lz77s = 1;
+ *crunch_configs_size = 1;
+ } else {
+ EntropyIx min_entropy_ix;
+ // Try out multiple LZ77 on images with few colors.
+ n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
+ if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
+ enc->palette_size_, enc->transform_bits_,
+ &min_entropy_ix, red_and_blue_always_zero)) {
+ return 0;
+ }
+ if (method == 6 && config->quality == 100) {
+ do_no_cache = 1;
+ // Go brute force on all transforms.
+ *crunch_configs_size = 0;
+ for (i = 0; i < kNumEntropyIx; ++i) {
+ // We can only apply kPalette or kPaletteAndSpatial if we can indeed use
+ // a palette.
+ if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) {
+ assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
+ crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+ if (use_palette && (i == kPalette || i == kPaletteAndSpatial)) {
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kMinimizeDelta;
+ ++*crunch_configs_size;
+ // Also add modified Zeng's method.
+ crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kModifiedZeng;
+ } else {
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kUnusedPalette;
+ }
+ ++*crunch_configs_size;
+ }
+ }
+ } else {
+ // Only choose the guessed best transform.
+ *crunch_configs_size = 1;
+ crunch_configs[0].entropy_idx_ = min_entropy_ix;
+ crunch_configs[0].palette_sorting_type_ =
+ use_palette ? kMinimizeDelta : kUnusedPalette;
+ if (config->quality >= 75 && method == 5) {
+ // Test with and without color cache.
+ do_no_cache = 1;
+ // If we have a palette, also check in combination with spatial.
+ if (min_entropy_ix == kPalette) {
+ *crunch_configs_size = 2;
+ crunch_configs[1].entropy_idx_ = kPaletteAndSpatial;
+ crunch_configs[1].palette_sorting_type_ = kMinimizeDelta;
+ }
+ }
+ }
+ }
+ // Fill in the different LZ77s.
+ assert(n_lz77s <= CRUNCH_SUBCONFIGS_MAX);
+ for (i = 0; i < *crunch_configs_size; ++i) {
+ int j;
+ for (j = 0; j < n_lz77s; ++j) {
+ assert(j < CRUNCH_SUBCONFIGS_MAX);
+ crunch_configs[i].sub_configs_[j].lz77_ =
+ (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+ crunch_configs[i].sub_configs_[j].do_no_cache_ = do_no_cache;
+ }
+ crunch_configs[i].sub_configs_size_ = n_lz77s;
+ }
+ return 1;
+}
+
+static int EncoderInit(VP8LEncoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const int pix_cnt = width * height;
+ // we round the block size up, so we're guaranteed to have
+ // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+ const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+ int i;
+ if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
+
+ for (i = 0; i < 4; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
+
+ return 1;
+}
+
+// Returns false in case of memory error.
+static int GetHuffBitLengthsAndCodes(
+ const VP8LHistogramSet* const histogram_image,
+ HuffmanTreeCode* const huffman_codes) {
+ int i, k;
+ int ok = 0;
+ uint64_t total_length_size = 0;
+ uint8_t* mem_buf = NULL;
+ const int histogram_image_size = histogram_image->size;
+ int max_num_symbols = 0;
+ uint8_t* buf_rle = NULL;
+ HuffmanTree* huff_tree = NULL;
+
+ // Iterate over all histograms and get the aggregate number of codes used.
+ for (i = 0; i < histogram_image_size; ++i) {
+ const VP8LHistogram* const histo = histogram_image->histograms[i];
+ HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+ assert(histo != NULL);
+ for (k = 0; k < 5; ++k) {
+ const int num_symbols =
+ (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) :
+ (k == 4) ? NUM_DISTANCE_CODES : 256;
+ codes[k].num_symbols = num_symbols;
+ total_length_size += num_symbols;
+ }
+ }
+
+  // Allocate and set Huffman codes.
+ {
+ uint16_t* codes;
+ uint8_t* lengths;
+ mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+ sizeof(*lengths) + sizeof(*codes));
+ if (mem_buf == NULL) goto End;
+
+ codes = (uint16_t*)mem_buf;
+ lengths = (uint8_t*)&codes[total_length_size];
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ const int bit_length = huffman_codes[i].num_symbols;
+ huffman_codes[i].codes = codes;
+ huffman_codes[i].code_lengths = lengths;
+ codes += bit_length;
+ lengths += bit_length;
+ if (max_num_symbols < bit_length) {
+ max_num_symbols = bit_length;
+ }
+ }
+ }
+
+ buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, max_num_symbols);
+ huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * max_num_symbols,
+ sizeof(*huff_tree));
+ if (buf_rle == NULL || huff_tree == NULL) goto End;
+
+ // Create Huffman trees.
+ for (i = 0; i < histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+ VP8LHistogram* const histo = histogram_image->histograms[i];
+ VP8LCreateHuffmanTree(histo->literal_, 15, buf_rle, huff_tree, codes + 0);
+ VP8LCreateHuffmanTree(histo->red_, 15, buf_rle, huff_tree, codes + 1);
+ VP8LCreateHuffmanTree(histo->blue_, 15, buf_rle, huff_tree, codes + 2);
+ VP8LCreateHuffmanTree(histo->alpha_, 15, buf_rle, huff_tree, codes + 3);
+ VP8LCreateHuffmanTree(histo->distance_, 15, buf_rle, huff_tree, codes + 4);
+ }
+ ok = 1;
+ End:
+ WebPSafeFree(huff_tree);
+ WebPSafeFree(buf_rle);
+ if (!ok) {
+ WebPSafeFree(mem_buf);
+ memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
+ }
+ return ok;
+}
+
+static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
+ VP8LBitWriter* const bw, const uint8_t* code_length_bitdepth) {
+ // RFC 1951 will calm you down if you are worried about this funny sequence.
+ // This sequence is tuned from that, but more weighted for lower symbol count,
+ // and more spiking histograms.
+ static const uint8_t kStorageOrder[CODE_LENGTH_CODES] = {
+ 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ int i;
+ // Throw away trailing zeros:
+ int codes_to_store = CODE_LENGTH_CODES;
+ for (; codes_to_store > 4; --codes_to_store) {
+ if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+ break;
+ }
+ }
+ VP8LPutBits(bw, codes_to_store - 4, 4);
+ for (i = 0; i < codes_to_store; ++i) {
+ VP8LPutBits(bw, code_length_bitdepth[kStorageOrder[i]], 3);
+ }
+}
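A standalone sketch of the trailing-zero trimming above, using toy code-length depths:

#include <stdio.h>

/* Depths are emitted in kStorageOrder; trailing zero entries are dropped,
 * down to a floor of 4 stored codes. Toy depths, not real data. */
int main(void) {
  static const unsigned char kOrder[19] = {
    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  unsigned char depth[19] = { 0 };
  int codes_to_store = 19;
  depth[0] = 3; depth[1] = 3; depth[2] = 2;  /* only symbols 0..2 used */
  for (; codes_to_store > 4; --codes_to_store) {
    if (depth[kOrder[codes_to_store - 1]] != 0) break;
  }
  /* 5: the stored order entries are 17, 18, 0, 1, 2. */
  printf("codes_to_store = %d\n", codes_to_store);
  return 0;
}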
+
+static void ClearHuffmanTreeIfOnlyOneSymbol(
+ HuffmanTreeCode* const huffman_code) {
+ int k;
+ int count = 0;
+ for (k = 0; k < huffman_code->num_symbols; ++k) {
+ if (huffman_code->code_lengths[k] != 0) {
+ ++count;
+ if (count > 1) return;
+ }
+ }
+ for (k = 0; k < huffman_code->num_symbols; ++k) {
+ huffman_code->code_lengths[k] = 0;
+ huffman_code->codes[k] = 0;
+ }
+}
+
+static void StoreHuffmanTreeToBitMask(
+ VP8LBitWriter* const bw,
+ const HuffmanTreeToken* const tokens, const int num_tokens,
+ const HuffmanTreeCode* const huffman_code) {
+ int i;
+ for (i = 0; i < num_tokens; ++i) {
+ const int ix = tokens[i].code;
+ const int extra_bits = tokens[i].extra_bits;
+ VP8LPutBits(bw, huffman_code->codes[ix], huffman_code->code_lengths[ix]);
+ switch (ix) {
+ case 16:
+ VP8LPutBits(bw, extra_bits, 2);
+ break;
+ case 17:
+ VP8LPutBits(bw, extra_bits, 3);
+ break;
+ case 18:
+ VP8LPutBits(bw, extra_bits, 7);
+ break;
+ }
+ }
+}
+
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const tree) {
+ uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
+ uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
+ const int max_tokens = tree->num_symbols;
+ int num_tokens;
+ HuffmanTreeCode huffman_code;
+ huffman_code.num_symbols = CODE_LENGTH_CODES;
+ huffman_code.code_lengths = code_length_bitdepth;
+ huffman_code.codes = code_length_bitdepth_symbols;
+
+ VP8LPutBits(bw, 0, 1);
+ num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
+ {
+ uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
+ uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 };
+ int i;
+ for (i = 0; i < num_tokens; ++i) {
+ ++histogram[tokens[i].code];
+ }
+
+ VP8LCreateHuffmanTree(histogram, 7, buf_rle, huff_tree, &huffman_code);
+ }
+
+ StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
+ ClearHuffmanTreeIfOnlyOneSymbol(&huffman_code);
+ {
+ int trailing_zero_bits = 0;
+ int trimmed_length = num_tokens;
+ int write_trimmed_length;
+ int length;
+ int i = num_tokens;
+ while (i-- > 0) {
+ const int ix = tokens[i].code;
+ if (ix == 0 || ix == 17 || ix == 18) {
+ --trimmed_length; // discount trailing zeros
+ trailing_zero_bits += code_length_bitdepth[ix];
+ if (ix == 17) {
+ trailing_zero_bits += 3;
+ } else if (ix == 18) {
+ trailing_zero_bits += 7;
+ }
+ } else {
+ break;
+ }
+ }
+ write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
+ length = write_trimmed_length ? trimmed_length : num_tokens;
+ VP8LPutBits(bw, write_trimmed_length, 1);
+ if (write_trimmed_length) {
+ if (trimmed_length == 2) {
+ VP8LPutBits(bw, 0, 3 + 2); // nbitpairs=1, trimmed_length=2
+ } else {
+ const int nbits = BitsLog2Floor(trimmed_length - 2);
+ const int nbitpairs = nbits / 2 + 1;
+ assert(trimmed_length > 2);
+ assert(nbitpairs - 1 < 8);
+ VP8LPutBits(bw, nbitpairs - 1, 3);
+ VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+ }
+ }
+ StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
+ }
+}
+
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const huffman_code) {
+ int i;
+ int count = 0;
+ int symbols[2] = { 0, 0 };
+ const int kMaxBits = 8;
+ const int kMaxSymbol = 1 << kMaxBits;
+
+ // Check whether it's a small tree.
+ for (i = 0; i < huffman_code->num_symbols && count < 3; ++i) {
+ if (huffman_code->code_lengths[i] != 0) {
+ if (count < 2) symbols[count] = i;
+ ++count;
+ }
+ }
+
+ if (count == 0) { // emit minimal tree for empty cases
+ // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
+ VP8LPutBits(bw, 0x01, 4);
+ } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
+ VP8LPutBits(bw, 1, 1); // Small tree marker to encode 1 or 2 symbols.
+ VP8LPutBits(bw, count - 1, 1);
+ if (symbols[0] <= 1) {
+ VP8LPutBits(bw, 0, 1); // Code bit for small (1 bit) symbol value.
+ VP8LPutBits(bw, symbols[0], 1);
+ } else {
+ VP8LPutBits(bw, 1, 1);
+ VP8LPutBits(bw, symbols[0], 8);
+ }
+ if (count == 2) {
+ VP8LPutBits(bw, symbols[1], 8);
+ }
+ } else {
+ StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
+ }
+}
+
+static WEBP_INLINE void WriteHuffmanCode(VP8LBitWriter* const bw,
+ const HuffmanTreeCode* const code,
+ int code_index) {
+ const int depth = code->code_lengths[code_index];
+ const int symbol = code->codes[code_index];
+ VP8LPutBits(bw, symbol, depth);
+}
+
+static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
+ VP8LBitWriter* const bw,
+ const HuffmanTreeCode* const code,
+ int code_index,
+ int bits,
+ int n_bits) {
+ const int depth = code->code_lengths[code_index];
+ const int symbol = code->codes[code_index];
+ VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
+}
+
+static WebPEncodingError StoreImageToBitMask(
+ VP8LBitWriter* const bw, int width, int histo_bits,
+ const VP8LBackwardRefs* const refs,
+ const uint16_t* histogram_symbols,
+ const HuffmanTreeCode* const huffman_codes) {
+ const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+ const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
+ // x and y trace the position in the image.
+ int x = 0;
+ int y = 0;
+ int tile_x = x & tile_mask;
+ int tile_y = y & tile_mask;
+ int histogram_ix = histogram_symbols[0];
+ const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
+ tile_x = x & tile_mask;
+ tile_y = y & tile_mask;
+ histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
+ (x >> histo_bits)];
+ codes = huffman_codes + 5 * histogram_ix;
+ }
+ if (PixOrCopyIsLiteral(v)) {
+ static const uint8_t order[] = { 1, 2, 0, 3 };
+ int k;
+ for (k = 0; k < 4; ++k) {
+ const int code = PixOrCopyLiteral(v, order[k]);
+ WriteHuffmanCode(bw, codes + k, code);
+ }
+ } else if (PixOrCopyIsCacheIdx(v)) {
+ const int code = PixOrCopyCacheIdx(v);
+ const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+ WriteHuffmanCode(bw, codes, literal_ix);
+ } else {
+ int bits, n_bits;
+ int code;
+
+ const int distance = PixOrCopyDistance(v);
+ VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+ WriteHuffmanCodeWithExtraBits(bw, codes, 256 + code, bits, n_bits);
+
+      // Don't write the distance with the extra-bits code, since the
+      // distance can take up to 18 extra bits and the prefix up to 15 bits,
+      // totaling 33, while our PutBits only supports up to 32 bits.
+ VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+ WriteHuffmanCode(bw, codes + 4, code);
+ VP8LPutBits(bw, bits, n_bits);
+ }
+ x += PixOrCopyLength(v);
+ while (x >= width) {
+ x -= width;
+ ++y;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+ return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
+}
+
+// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
+static WebPEncodingError EncodeImageNoHuffman(
+ VP8LBitWriter* const bw, const uint32_t* const argb,
+ VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_array,
+ int width, int height, int quality, int low_effort) {
+ int i;
+ int max_tokens = 0;
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LBackwardRefs* refs;
+ HuffmanTreeToken* tokens = NULL;
+ HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
+ const uint16_t histogram_symbols[1] = { 0 }; // only one tree, one symbol
+ int cache_bits = 0;
+ VP8LHistogramSet* histogram_image = NULL;
+ HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+ 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+ if (huff_tree == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Calculate backward references from ARGB image.
+ if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ err = VP8LGetBackwardReferences(
+ width, height, argb, quality, /*low_effort=*/0, kLZ77Standard | kLZ77RLE,
+ cache_bits, /*do_no_cache=*/0, hash_chain, refs_array, &cache_bits);
+ if (err != VP8_ENC_OK) goto Error;
+ refs = &refs_array[0];
+ histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
+ if (histogram_image == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ VP8LHistogramSetClear(histogram_image);
+
+ // Build histogram image and symbols from backward references.
+ VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
+
+ // Create Huffman bit lengths and codes for each histogram image.
+ assert(histogram_image->size == 1);
+ if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // No color cache, no Huffman image.
+ VP8LPutBits(bw, 0, 1);
+
+ // Find maximum number of symbols for the huffman tree-set.
+ for (i = 0; i < 5; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
+ }
+ }
+
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+ if (tokens == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Store Huffman codes.
+ for (i = 0; i < 5; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
+ ClearHuffmanTreeIfOnlyOneSymbol(codes);
+ }
+
+ // Store actual literals.
+ err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
+ huffman_codes);
+
+ Error:
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ WebPSafeFree(huffman_codes[0].codes);
+ return err;
+}
+
+static WebPEncodingError EncodeImageInternal(
+ VP8LBitWriter* const bw, const uint32_t* const argb,
+ VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
+ int height, int quality, int low_effort, int use_cache,
+ const CrunchConfig* const config, int* cache_bits, int histogram_bits,
+ size_t init_byte_position, int* const hdr_size, int* const data_size) {
+ WebPEncodingError err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ const uint32_t histogram_image_xysize =
+ VP8LSubSampleSize(width, histogram_bits) *
+ VP8LSubSampleSize(height, histogram_bits);
+ VP8LHistogramSet* histogram_image = NULL;
+ VP8LHistogram* tmp_histo = NULL;
+ int histogram_image_size = 0;
+ size_t bit_array_size = 0;
+ HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+ 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+ HuffmanTreeToken* tokens = NULL;
+ HuffmanTreeCode* huffman_codes = NULL;
+ uint16_t* const histogram_symbols =
+ (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
+ sizeof(*histogram_symbols));
+ int sub_configs_idx;
+ int cache_bits_init, write_histogram_image;
+ VP8LBitWriter bw_init = *bw, bw_best;
+ int hdr_size_tmp;
+ VP8LHashChain hash_chain_histogram; // histogram image hash chain
+ size_t bw_size_best = ~(size_t)0;
+ assert(histogram_bits >= MIN_HUFFMAN_BITS);
+ assert(histogram_bits <= MAX_HUFFMAN_BITS);
+ assert(hdr_size != NULL);
+ assert(data_size != NULL);
+
+ // Make sure we can allocate the different objects.
+ memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram));
+ if (huff_tree == NULL || histogram_symbols == NULL ||
+ !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize) ||
+ !VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
+ goto Error;
+ }
+ if (use_cache) {
+ // If the value is different from zero, it has been set during the
+ // palette analysis.
+ cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
+ } else {
+ cache_bits_init = 0;
+ }
+ // If several iterations will happen, clone into bw_best.
+ if (!VP8LBitWriterInit(&bw_best, 0) ||
+ ((config->sub_configs_size_ > 1 ||
+ config->sub_configs_[0].do_no_cache_) &&
+ !VP8LBitWriterClone(bw, &bw_best))) {
+ goto Error;
+ }
+ for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_;
+ ++sub_configs_idx) {
+ const CrunchSubConfig* const sub_config =
+ &config->sub_configs_[sub_configs_idx];
+ int cache_bits_best, i_cache;
+ err = VP8LGetBackwardReferences(width, height, argb, quality, low_effort,
+ sub_config->lz77_, cache_bits_init,
+ sub_config->do_no_cache_, hash_chain,
+ &refs_array[0], &cache_bits_best);
+ if (err != VP8_ENC_OK) goto Error;
+
+ for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
+ const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+ // Speed-up: no need to study the no-cache case if it was already studied
+ // in i_cache == 0.
+ if (i_cache == 1 && cache_bits_best == 0) break;
+
+ // Reset the bit writer for this iteration.
+ VP8LBitWriterReset(&bw_init, bw);
+
+ // Build histogram image and symbols from backward references.
+ histogram_image =
+ VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp);
+ tmp_histo = VP8LAllocateHistogram(cache_bits_tmp);
+ if (histogram_image == NULL || tmp_histo == NULL ||
+ !VP8LGetHistoImageSymbols(width, height, &refs_array[i_cache],
+ quality, low_effort, histogram_bits,
+ cache_bits_tmp, histogram_image, tmp_histo,
+ histogram_symbols)) {
+ goto Error;
+ }
+ // Create Huffman bit lengths and codes for each histogram image.
+ histogram_image_size = histogram_image->size;
+ bit_array_size = 5 * histogram_image_size;
+ huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+ sizeof(*huffman_codes));
+      // Note: some histogram_image entries may point to tmp_histo, so the
+      // latter needs to outlive the following call to
+      // GetHuffBitLengthsAndCodes().
+ if (huffman_codes == NULL ||
+ !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+ goto Error;
+ }
+ // Free combined histograms.
+ VP8LFreeHistogramSet(histogram_image);
+ histogram_image = NULL;
+
+ // Free scratch histograms.
+ VP8LFreeHistogram(tmp_histo);
+ tmp_histo = NULL;
+
+ // Color Cache parameters.
+ if (cache_bits_tmp > 0) {
+ VP8LPutBits(bw, 1, 1);
+ VP8LPutBits(bw, cache_bits_tmp, 4);
+ } else {
+ VP8LPutBits(bw, 0, 1);
+ }
+
+ // Huffman image + meta huffman.
+ write_histogram_image = (histogram_image_size > 1);
+ VP8LPutBits(bw, write_histogram_image, 1);
+ if (write_histogram_image) {
+ uint32_t* const histogram_argb =
+ (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
+ sizeof(*histogram_argb));
+ int max_index = 0;
+ uint32_t i;
+ if (histogram_argb == NULL) goto Error;
+ for (i = 0; i < histogram_image_xysize; ++i) {
+ const int symbol_index = histogram_symbols[i] & 0xffff;
+ histogram_argb[i] = (symbol_index << 8);
+ if (symbol_index >= max_index) {
+ max_index = symbol_index + 1;
+ }
+ }
+ histogram_image_size = max_index;
+
+ VP8LPutBits(bw, histogram_bits - 2, 3);
+ err = EncodeImageNoHuffman(
+ bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
+ VP8LSubSampleSize(width, histogram_bits),
+ VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+ WebPSafeFree(histogram_argb);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // Store Huffman codes.
+ {
+ int i;
+ int max_tokens = 0;
+ // Find maximum number of symbols for the huffman tree-set.
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
+ }
+ }
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+ if (tokens == NULL) goto Error;
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
+ ClearHuffmanTreeIfOnlyOneSymbol(codes);
+ }
+ }
+ // Store actual literals.
+ hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+ err = StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
+ histogram_symbols, huffman_codes);
+ if (err != VP8_ENC_OK) goto Error;
+ // Keep track of the smallest image so far.
+ if (VP8LBitWriterNumBytes(bw) < bw_size_best) {
+ bw_size_best = VP8LBitWriterNumBytes(bw);
+ *cache_bits = cache_bits_tmp;
+ *hdr_size = hdr_size_tmp;
+ *data_size =
+ (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+ VP8LBitWriterSwap(bw, &bw_best);
+ }
+ WebPSafeFree(tokens);
+ tokens = NULL;
+ if (huffman_codes != NULL) {
+ WebPSafeFree(huffman_codes->codes);
+ WebPSafeFree(huffman_codes);
+ huffman_codes = NULL;
+ }
+ }
+ }
+ VP8LBitWriterSwap(bw, &bw_best);
+ err = VP8_ENC_OK;
+
+ Error:
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ VP8LFreeHistogram(tmp_histo);
+ VP8LHashChainClear(&hash_chain_histogram);
+ if (huffman_codes != NULL) {
+ WebPSafeFree(huffman_codes->codes);
+ WebPSafeFree(huffman_codes);
+ }
+ WebPSafeFree(histogram_symbols);
+ VP8LBitWriterWipeOut(&bw_best);
+ return err;
+}
+
+// -----------------------------------------------------------------------------
+// Transforms
+
+static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
+ VP8LBitWriter* const bw) {
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+ VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
+}
+
+static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
+ int width, int height,
+ int quality, int low_effort,
+ int used_subtract_green,
+ VP8LBitWriter* const bw) {
+ const int pred_bits = enc->transform_bits_;
+ const int transform_width = VP8LSubSampleSize(width, pred_bits);
+ const int transform_height = VP8LSubSampleSize(height, pred_bits);
+ // we disable near-lossless quantization if palette is used.
+ const int near_lossless_strength = enc->use_palette_ ? 100
+ : enc->config_->near_lossless;
+
+ VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
+ enc->argb_scratch_, enc->transform_data_,
+ near_lossless_strength, enc->config_->exact,
+ used_subtract_green);
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
+ assert(pred_bits >= 2);
+ VP8LPutBits(bw, pred_bits - 2, 3);
+ return EncodeImageNoHuffman(
+ bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+ quality, low_effort);
+}
+
+static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
+ int width, int height,
+ int quality, int low_effort,
+ VP8LBitWriter* const bw) {
+ const int ccolor_transform_bits = enc->transform_bits_;
+ const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
+ const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+
+ VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
+ enc->argb_, enc->transform_data_);
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
+ assert(ccolor_transform_bits >= 2);
+ VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
+ return EncodeImageNoHuffman(
+ bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+ quality, low_effort);
+}
+
+// -----------------------------------------------------------------------------
+
+static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
+ size_t riff_size, size_t vp8l_size) {
+ uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
+ 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
+ 'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
+ };
+ PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+ PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
+ if (!pic->writer(riff, sizeof(riff), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static int WriteImageSize(const WebPPicture* const pic,
+ VP8LBitWriter* const bw) {
+ const int width = pic->width - 1;
+ const int height = pic->height - 1;
+ assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
+
+ VP8LPutBits(bw, width, VP8L_IMAGE_SIZE_BITS);
+ VP8LPutBits(bw, height, VP8L_IMAGE_SIZE_BITS);
+ return !bw->error_;
+}
+
+static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
+ VP8LPutBits(bw, has_alpha, 1);
+ VP8LPutBits(bw, VP8L_VERSION, VP8L_VERSION_BITS);
+ return !bw->error_;
+}
+
+static WebPEncodingError WriteImage(const WebPPicture* const pic,
+ VP8LBitWriter* const bw,
+ size_t* const coded_size) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
+ const size_t webpll_size = VP8LBitWriterNumBytes(bw);
+ const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
+ const size_t pad = vp8l_size & 1;
+ const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
+
+ err = WriteRiffHeader(pic, riff_size, vp8l_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!pic->writer(webpll_data, webpll_size, pic)) {
+ err = VP8_ENC_ERROR_BAD_WRITE;
+ goto Error;
+ }
+
+ if (pad) {
+ const uint8_t pad_byte[1] = { 0 };
+ if (!pic->writer(pad_byte, 1, pic)) {
+ err = VP8_ENC_ERROR_BAD_WRITE;
+ goto Error;
+ }
+ }
+ *coded_size = CHUNK_HEADER_SIZE + riff_size;
+ return VP8_ENC_OK;
+
+ Error:
+ return err;
+}
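The size fields above follow the usual RIFF accounting; a standalone sketch with the header constants written out literally (TAG_SIZE 4, CHUNK_HEADER_SIZE 8, VP8L_SIGNATURE_SIZE 1, assumed values) and a hypothetical payload size:

#include <stddef.h>
#include <stdio.h>

int main(void) {
  const size_t webpll_size = 12344;          /* hypothetical payload */
  const size_t vp8l_size = 1 + webpll_size;  /* + one signature byte */
  const size_t pad = vp8l_size & 1;          /* chunks are 2-byte aligned */
  const size_t riff_size = 4 + 8 + vp8l_size + pad;
  /* coded size adds the outer 'RIFF' tag + length field (8 bytes). */
  printf("vp8l=%zu pad=%zu riff=%zu coded=%zu\n",
         vp8l_size, pad, riff_size, 8 + riff_size);
  return 0;
}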
+
+// -----------------------------------------------------------------------------
+
+static void ClearTransformBuffer(VP8LEncoder* const enc) {
+ WebPSafeFree(enc->transform_mem_);
+ enc->transform_mem_ = NULL;
+ enc->transform_mem_size_ = 0;
+}
+
+// Allocates the memory for argb (W x H) buffer, 2 rows of context for
+// prediction and transform data.
+// Flags influencing the memory allocated:
+// enc->transform_bits_
+// enc->use_predict_, enc->use_cross_color_
+static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
+ int width, int height) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const uint64_t image_size = width * height;
+ // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra
+ // pixel in each, plus 2 regular scanlines of bytes.
+ // TODO(skal): Clean up by using arithmetic in bytes instead of words.
+ const uint64_t argb_scratch_size =
+ enc->use_predict_
+ ? (width + 1) * 2 +
+ (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t)
+ : 0;
+ const uint64_t transform_data_size =
+ (enc->use_predict_ || enc->use_cross_color_)
+ ? VP8LSubSampleSize(width, enc->transform_bits_) *
+ VP8LSubSampleSize(height, enc->transform_bits_)
+ : 0;
+ const uint64_t max_alignment_in_words =
+ (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
+ const uint64_t mem_size =
+ image_size + max_alignment_in_words +
+ argb_scratch_size + max_alignment_in_words +
+ transform_data_size;
+ uint32_t* mem = enc->transform_mem_;
+ if (mem == NULL || mem_size > enc->transform_mem_size_) {
+ ClearTransformBuffer(enc);
+ mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem));
+ if (mem == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ enc->transform_mem_ = mem;
+ enc->transform_mem_size_ = (size_t)mem_size;
+ enc->argb_content_ = kEncoderNone;
+ }
+ enc->argb_ = mem;
+ mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
+ enc->argb_scratch_ = mem;
+ mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size);
+ enc->transform_data_ = mem;
+
+ enc->current_width_ = width;
+ Error:
+ return err;
+}
+
+static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const WebPPicture* const picture = enc->pic_;
+ const int width = picture->width;
+ const int height = picture->height;
+
+ err = AllocateTransformBuffer(enc, width, height);
+ if (err != VP8_ENC_OK) return err;
+ if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
+
+ {
+ uint32_t* dst = enc->argb_;
+ const uint32_t* src = picture->argb;
+ int y;
+ for (y = 0; y < height; ++y) {
+ memcpy(dst, src, width * sizeof(*dst));
+ dst += width;
+ src += picture->argb_stride;
+ }
+ }
+ enc->argb_content_ = kEncoderARGB;
+ assert(enc->current_width_ == width);
+ return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+ int palette_size,
+ uint32_t color) {
+ (void)palette_size;
+ assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+ assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+ if (color == palette[0]) return 0;
+ if (color == palette[1]) return 1;
+ if (color == palette[2]) return 2;
+ return 3;
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+ // Focus on the green color.
+ return (color >> 8) & 0xff;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+ // Forget about alpha.
+ return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >>
+ (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+ // Forget about alpha.
+ return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >>
+ (32 - PALETTE_INV_SIZE_BITS);
+}
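A standalone sketch of the perfect-hash probing used in ApplyPalette below: try a hash on every palette entry and fall back if two entries collide. This uses the same green-byte hash as ApplyPaletteHash0; the palette values are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t Hash0(uint32_t color) { return (color >> 8) & 0xff; }

int main(void) {
  const uint32_t palette[3] = { 0xff001000u, 0xff002000u, 0xff003000u };
  uint16_t buffer[256];
  int j, use_LUT = 1;
  memset(buffer, 0xff, sizeof(buffer));  /* 0xffff marks an empty slot */
  for (j = 0; j < 3; ++j) {
    const uint32_t ind = Hash0(palette[j]);
    if (buffer[ind] != 0xffffu) { use_LUT = 0; break; }  /* collision */
    buffer[ind] = (uint16_t)j;
  }
  /* Distinct green bytes 0x10, 0x20, 0x30: no collision here. */
  printf("hash0 %s\n", use_LUT ? "is collision-free" : "collides");
  return 0;
}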
+
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do { \
+ uint32_t prev_pix = palette[0]; \
+ uint32_t prev_idx = 0; \
+ for (y = 0; y < height; ++y) { \
+ for (x = 0; x < width; ++x) { \
+ const uint32_t pix = src[x]; \
+ if (pix != prev_pix) { \
+ prev_idx = COLOR_INDEX; \
+ prev_pix = pix; \
+ } \
+ tmp_row[x] = prev_idx; \
+ } \
+ VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+} while (0)
+
+// Remap argb values in src[] to packed palette entries in dst[],
+// using 'tmp_row' as a temporary buffer of size 'width'.
+// We assume that all src[] values have a corresponding entry in the palette.
+// Note: src[] can be the same as dst[].
+static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
+ uint32_t* dst, uint32_t dst_stride,
+ const uint32_t* palette, int palette_size,
+ int width, int height, int xbits) {
+ // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
+ // made to work in-place.
+ uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
+ int x, y;
+
+ if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+
+ if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+ APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+ } else {
+ int i, j;
+ uint16_t buffer[PALETTE_INV_SIZE];
+ uint32_t (*const hash_functions[])(uint32_t) = {
+ ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+ };
+
+ // Try to find a perfect hash function able to go from a color to an index
+ // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+ // from color to index in palette.
+ for (i = 0; i < 3; ++i) {
+ int use_LUT = 1;
+ // Set each element in buffer to max uint16_t.
+ memset(buffer, 0xff, sizeof(buffer));
+ for (j = 0; j < palette_size; ++j) {
+ const uint32_t ind = hash_functions[i](palette[j]);
+ if (buffer[ind] != 0xffffu) {
+ use_LUT = 0;
+ break;
+ } else {
+ buffer[ind] = j;
+ }
+ }
+ if (use_LUT) break;
+ }
+
+ if (i == 0) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+ } else if (i == 1) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+ } else if (i == 2) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+ } else {
+ uint32_t idx_map[MAX_PALETTE_SIZE];
+ uint32_t palette_sorted[MAX_PALETTE_SIZE];
+ PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+ APPLY_PALETTE_FOR(
+ idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
+ }
+ }
+ WebPSafeFree(tmp_row);
+ return VP8_ENC_OK;
+}
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
+
+// Note: Expects "enc->palette_" to be set properly.
+static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
+ int in_place) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const uint32_t* const palette = enc->palette_;
+ const uint32_t* src = in_place ? enc->argb_ : pic->argb;
+ const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
+ const int palette_size = enc->palette_size_;
+ int xbits;
+
+ // Replace each input pixel by corresponding palette index.
+ // This is done line by line.
+ if (palette_size <= 4) {
+ xbits = (palette_size <= 2) ? 3 : 2;
+ } else {
+ xbits = (palette_size <= 16) ? 1 : 0;
+ }
+
+ err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+ if (err != VP8_ENC_OK) return err;
+
+ err = ApplyPalette(src, src_stride,
+ enc->argb_, enc->current_width_,
+ palette, palette_size, width, height, xbits);
+ enc->argb_content_ = kEncoderPalette;
+ return err;
+}
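A standalone sketch of the index-packing arithmetic above: xbits is the log2 of how many pixels get bundled per output pixel, and the packed width follows from VP8LSubSampleSize, re-implemented inline here:

#include <stdio.h>

static int SubSampleSize(int size, int bits) {
  return (size + (1 << bits) - 1) >> bits;
}

int main(void) {
  const int sizes[4] = { 2, 4, 16, 256 };  /* sample palette sizes */
  int i;
  for (i = 0; i < 4; ++i) {
    const int ps = sizes[i];
    /* Same mapping as MapImageFromPalette: <=2 colors pack 8 pixels,
     * <=4 pack 4, <=16 pack 2, larger palettes stay unpacked. */
    const int xbits = (ps <= 2) ? 3 : (ps <= 4) ? 2 : (ps <= 16) ? 1 : 0;
    printf("palette=%3d xbits=%d packed width of 100 -> %d\n",
           ps, xbits, SubSampleSize(100, xbits));
  }
  return 0;
}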
+
+// Save palette_[] to bitstream.
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
+ VP8LEncoder* const enc) {
+ int i;
+ uint32_t tmp_palette[MAX_PALETTE_SIZE];
+ const int palette_size = enc->palette_size_;
+ const uint32_t* const palette = enc->palette_;
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
+ assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
+ VP8LPutBits(bw, palette_size - 1, 8);
+ for (i = palette_size - 1; i >= 1; --i) {
+ tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+ }
+ tmp_palette[0] = palette[0];
+ return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
+ &enc->refs_[0], palette_size, 1, /*quality=*/20,
+ low_effort);
+}
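A standalone sketch of the palette delta-coding above: each entry is replaced by its channel-wise difference from the previous one, which is what makes the palette cheap to compress (toy near-grayscale palette):

#include <stdint.h>
#include <stdio.h>

/* Channel-wise (a - b) mod 256, standing in for VP8LSubPixels. */
static uint32_t SubPixels(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  int s;
  for (s = 0; s < 32; s += 8) r |= (((a >> s) - (b >> s)) & 0xff) << s;
  return r;
}

int main(void) {
  const uint32_t palette[3] = { 0xff101010u, 0xff202020u, 0xff303030u };
  uint32_t tmp[3];
  int i;
  tmp[0] = palette[0];
  for (i = 2; i >= 1; --i) tmp[i] = SubPixels(palette[i], palette[i - 1]);
  /* 0xff101010, 0x00101010, 0x00101010: repeated deltas code well. */
  for (i = 0; i < 3; ++i) printf("0x%08x\n", tmp[i]);
  return 0;
}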
+
+// -----------------------------------------------------------------------------
+// VP8LEncoder
+
+static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
+ const WebPPicture* const picture) {
+ VP8LEncoder* const enc = (VP8LEncoder*)WebPSafeCalloc(1ULL, sizeof(*enc));
+ if (enc == NULL) {
+ WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ return NULL;
+ }
+ enc->config_ = config;
+ enc->pic_ = picture;
+ enc->argb_content_ = kEncoderNone;
+
+ VP8LEncDspInit();
+
+ return enc;
+}
+
+static void VP8LEncoderDelete(VP8LEncoder* enc) {
+ if (enc != NULL) {
+ int i;
+ VP8LHashChainClear(&enc->hash_chain_);
+ for (i = 0; i < 4; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
+ ClearTransformBuffer(enc);
+ WebPSafeFree(enc);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Main call
+
+typedef struct {
+ const WebPConfig* config_;
+ const WebPPicture* picture_;
+ VP8LBitWriter* bw_;
+ VP8LEncoder* enc_;
+ int use_cache_;
+ CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
+ int num_crunch_configs_;
+ int red_and_blue_always_zero_;
+ WebPEncodingError err_;
+ WebPAuxStats* stats_;
+} StreamEncodeContext;
+
+static int EncodeStreamHook(void* input, void* data2) {
+ StreamEncodeContext* const params = (StreamEncodeContext*)input;
+ const WebPConfig* const config = params->config_;
+ const WebPPicture* const picture = params->picture_;
+ VP8LBitWriter* const bw = params->bw_;
+ VP8LEncoder* const enc = params->enc_;
+ const int use_cache = params->use_cache_;
+ const CrunchConfig* const crunch_configs = params->crunch_configs_;
+ const int num_crunch_configs = params->num_crunch_configs_;
+ const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
+#if !defined(WEBP_DISABLE_STATS)
+ WebPAuxStats* const stats = params->stats_;
+#endif
+ WebPEncodingError err = VP8_ENC_OK;
+ const int quality = (int)config->quality;
+ const int low_effort = (config->method == 0);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ const int width = picture->width;
+#endif
+ const int height = picture->height;
+ const size_t byte_position = VP8LBitWriterNumBytes(bw);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ int use_near_lossless = 0;
+#endif
+ int hdr_size = 0;
+ int data_size = 0;
+ int use_delta_palette = 0;
+ int idx;
+ size_t best_size = ~(size_t)0;
+ VP8LBitWriter bw_init = *bw, bw_best;
+ (void)data2;
+
+ if (!VP8LBitWriterInit(&bw_best, 0) ||
+ (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ for (idx = 0; idx < num_crunch_configs; ++idx) {
+ const int entropy_idx = crunch_configs[idx].entropy_idx_;
+ enc->use_palette_ =
+ (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial);
+ enc->use_subtract_green_ =
+ (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
+ enc->use_predict_ = (entropy_idx == kSpatial) ||
+ (entropy_idx == kSpatialSubGreen) ||
+ (entropy_idx == kPaletteAndSpatial);
+ // When using a palette, R/B==0, hence no need to test for cross-color.
+ if (low_effort || enc->use_palette_) {
+ enc->use_cross_color_ = 0;
+ } else {
+ enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
+ }
+ // Reset any parameter in the encoder that was set in a previous iteration.
+ enc->cache_bits_ = 0;
+ VP8LBackwardRefsClear(&enc->refs_[0]);
+ VP8LBackwardRefsClear(&enc->refs_[1]);
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+ // Apply near-lossless preprocessing.
+ use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
+ !enc->use_predict_;
+ if (use_near_lossless) {
+ err = AllocateTransformBuffer(enc, width, height);
+ if (err != VP8_ENC_OK) goto Error;
+ if ((enc->argb_content_ != kEncoderNearLossless) &&
+ !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ enc->argb_content_ = kEncoderNearLossless;
+ } else {
+ enc->argb_content_ = kEncoderNone;
+ }
+#else
+ enc->argb_content_ = kEncoderNone;
+#endif
+
+ // Encode palette
+ if (enc->use_palette_) {
+ if (crunch_configs[idx].palette_sorting_type_ == kSortedDefault) {
+ // Nothing to do, we have already sorted the palette.
+ memcpy(enc->palette_, enc->palette_sorted_,
+ enc->palette_size_ * sizeof(*enc->palette_));
+ } else if (crunch_configs[idx].palette_sorting_type_ == kMinimizeDelta) {
+ PaletteSortMinimizeDeltas(enc->palette_sorted_, enc->palette_size_,
+ enc->palette_);
+ } else {
+ assert(crunch_configs[idx].palette_sorting_type_ == kModifiedZeng);
+ err = PaletteSortModifiedZeng(enc->pic_, enc->palette_sorted_,
+ enc->palette_size_, enc->palette_);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+ err = EncodePalette(bw, low_effort, enc);
+ if (err != VP8_ENC_OK) goto Error;
+ err = MapImageFromPalette(enc, use_delta_palette);
+ if (err != VP8_ENC_OK) goto Error;
+ // If using a color cache, do not make it bigger than the number of
+ // colors.
+ if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+ enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+ }
+ }
+ if (!use_delta_palette) {
+ // In case the image is not packed.
+ if (enc->argb_content_ != kEncoderNearLossless &&
+ enc->argb_content_ != kEncoderPalette) {
+ err = MakeInputImageCopy(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // -----------------------------------------------------------------------
+ // Apply transforms and write transform data.
+
+ if (enc->use_subtract_green_) {
+ ApplySubtractGreen(enc, enc->current_width_, height, bw);
+ }
+
+ if (enc->use_predict_) {
+ err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+ low_effort, enc->use_subtract_green_, bw);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ if (enc->use_cross_color_) {
+ err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+ low_effort, bw);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+ }
+
+ VP8LPutBits(bw, !TRANSFORM_PRESENT, 1); // No more transforms.
+
+ // -------------------------------------------------------------------------
+ // Encode and write the transformed image.
+ err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+ enc->current_width_, height, quality, low_effort,
+ use_cache, &crunch_configs[idx],
+ &enc->cache_bits_, enc->histo_bits_,
+ byte_position, &hdr_size, &data_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // If this iteration's output is smaller than the best so far, keep it.
+ if (VP8LBitWriterNumBytes(bw) < best_size) {
+ best_size = VP8LBitWriterNumBytes(bw);
+ // Store the BitWriter.
+ VP8LBitWriterSwap(bw, &bw_best);
+#if !defined(WEBP_DISABLE_STATS)
+ // Update the stats.
+ if (stats != NULL) {
+ stats->lossless_features = 0;
+ if (enc->use_predict_) stats->lossless_features |= 1;
+ if (enc->use_cross_color_) stats->lossless_features |= 2;
+ if (enc->use_subtract_green_) stats->lossless_features |= 4;
+ if (enc->use_palette_) stats->lossless_features |= 8;
+ stats->histogram_bits = enc->histo_bits_;
+ stats->transform_bits = enc->transform_bits_;
+ stats->cache_bits = enc->cache_bits_;
+ stats->palette_size = enc->palette_size_;
+ stats->lossless_size = (int)(best_size - byte_position);
+ stats->lossless_hdr_size = hdr_size;
+ stats->lossless_data_size = data_size;
+ }
+#endif
+ }
+ // Reset the bit writer for the following iteration if any.
+ if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw);
+ }
+ VP8LBitWriterSwap(&bw_best, bw);
+
+Error:
+ VP8LBitWriterWipeOut(&bw_best);
+ params->err_ = err;
+ // The hook should return false in case of error.
+ return (err == VP8_ENC_OK);
+}
+
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+ const WebPPicture* const picture,
+ VP8LBitWriter* const bw_main,
+ int use_cache) {
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
+ VP8LEncoder* enc_side = NULL;
+ CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
+ int num_crunch_configs_main, num_crunch_configs_side = 0;
+ int idx;
+ int red_and_blue_always_zero = 0;
+ WebPWorker worker_main, worker_side;
+ StreamEncodeContext params_main, params_side;
+ // The main thread uses picture->stats, the side thread uses stats_side.
+ WebPAuxStats stats_side;
+ VP8LBitWriter bw_side;
+ const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
+ int ok_main;
+
+ // Analyze the image (entropy, num_palettes, etc.)
+ if (enc_main == NULL ||
+ !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+ &red_and_blue_always_zero) ||
+ !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Split the configs between the main and side threads (if any).
+ if (config->thread_level > 0) {
+ num_crunch_configs_side = num_crunch_configs_main / 2;
+ for (idx = 0; idx < num_crunch_configs_side; ++idx) {
+ params_side.crunch_configs_[idx] =
+ crunch_configs[num_crunch_configs_main - num_crunch_configs_side +
+ idx];
+ }
+ params_side.num_crunch_configs_ = num_crunch_configs_side;
+ }
+ num_crunch_configs_main -= num_crunch_configs_side;
+ for (idx = 0; idx < num_crunch_configs_main; ++idx) {
+ params_main.crunch_configs_[idx] = crunch_configs[idx];
+ }
+ params_main.num_crunch_configs_ = num_crunch_configs_main;
+
+ // Fill in the parameters for the thread workers.
+ {
+ const int params_size = (num_crunch_configs_side > 0) ? 2 : 1;
+ for (idx = 0; idx < params_size; ++idx) {
+ // Create the parameters for each worker.
+ WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side;
+ StreamEncodeContext* const param =
+ (idx == 0) ? &params_main : &params_side;
+ param->config_ = config;
+ param->picture_ = picture;
+ param->use_cache_ = use_cache;
+ param->red_and_blue_always_zero_ = red_and_blue_always_zero;
+ if (idx == 0) {
+ param->stats_ = picture->stats;
+ param->bw_ = bw_main;
+ param->enc_ = enc_main;
+ } else {
+ param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
+ // Create a side bit writer.
+ if (!VP8LBitWriterClone(bw_main, &bw_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ param->bw_ = &bw_side;
+ // Create a side encoder.
+ enc_side = VP8LEncoderNew(config, picture);
+ if (enc_side == NULL || !EncoderInit(enc_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ // Copy the values that were computed for the main encoder.
+ enc_side->histo_bits_ = enc_main->histo_bits_;
+ enc_side->transform_bits_ = enc_main->transform_bits_;
+ enc_side->palette_size_ = enc_main->palette_size_;
+ memcpy(enc_side->palette_, enc_main->palette_,
+ sizeof(enc_main->palette_));
+ memcpy(enc_side->palette_sorted_, enc_main->palette_sorted_,
+ sizeof(enc_main->palette_sorted_));
+ param->enc_ = enc_side;
+ }
+ // Create the workers.
+ worker_interface->Init(worker);
+ worker->data1 = param;
+ worker->data2 = NULL;
+ worker->hook = EncodeStreamHook;
+ }
+ }
+
+ // Start the second thread if needed.
+ if (num_crunch_configs_side != 0) {
+ if (!worker_interface->Reset(&worker_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+#if !defined(WEBP_DISABLE_STATS)
+ // This line is here and not in the param initialization above to remove a
+ // Clang static analyzer warning.
+ if (picture->stats != NULL) {
+ memcpy(&stats_side, picture->stats, sizeof(stats_side));
+ }
+#endif
+ // This line is only useful to remove a Clang static analyzer warning.
+ params_side.err_ = VP8_ENC_OK;
+ worker_interface->Launch(&worker_side);
+ }
+ // Execute the main thread.
+ worker_interface->Execute(&worker_main);
+ ok_main = worker_interface->Sync(&worker_main);
+ worker_interface->End(&worker_main);
+ if (num_crunch_configs_side != 0) {
+ // Wait for the second thread.
+ const int ok_side = worker_interface->Sync(&worker_side);
+ worker_interface->End(&worker_side);
+ if (!ok_main || !ok_side) {
+ err = ok_main ? params_side.err_ : params_main.err_;
+ goto Error;
+ }
+ if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
+ VP8LBitWriterSwap(bw_main, &bw_side);
+#if !defined(WEBP_DISABLE_STATS)
+ if (picture->stats != NULL) {
+ memcpy(picture->stats, &stats_side, sizeof(*picture->stats));
+ }
+#endif
+ }
+ } else {
+ if (!ok_main) {
+ err = params_main.err_;
+ goto Error;
+ }
+ }
+
+Error:
+ VP8LBitWriterWipeOut(&bw_side);
+ VP8LEncoderDelete(enc_main);
+ VP8LEncoderDelete(enc_side);
+ return err;
+}
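
A sketch of the work split above: with thread_level > 0 the side worker takes the second half (rounded down) of the crunch configs and the main worker keeps the first part. Names and signature here are illustrative only:

    static void SplitConfigs(int total, int thread_level,
                             int* main_count, int* side_count) {
      *side_count = (thread_level > 0) ? total / 2 : 0;
      *main_count = total - *side_count;  /* e.g. 5 configs -> main 3, side 2 */
    }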
+
+#undef CRUNCH_CONFIGS_MAX
+#undef CRUNCH_SUBCONFIGS_MAX
+
+int VP8LEncodeImage(const WebPConfig* const config,
+ const WebPPicture* const picture) {
+ int width, height;
+ int has_alpha;
+ size_t coded_size;
+ int percent = 0;
+ int initial_size;
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LBitWriter bw;
+
+ if (picture == NULL) return 0;
+
+ if (config == NULL || picture->argb == NULL) {
+ err = VP8_ENC_ERROR_NULL_PARAMETER;
+ WebPEncodingSetError(picture, err);
+ return 0;
+ }
+
+ width = picture->width;
+ height = picture->height;
+ // Initialize the BitWriter with a size corresponding to 16 bpp for photo
+ // images and 8 bpp for graphical images.
+ initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
+ width * height : width * height * 2;
+ if (!VP8LBitWriterInit(&bw, initial_size)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ if (!WebPReportProgress(picture, 1, &percent)) {
+ UserAbort:
+ err = VP8_ENC_ERROR_USER_ABORT;
+ goto Error;
+ }
+ // Reset stats (for pure lossless coding)
+ if (picture->stats != NULL) {
+ WebPAuxStats* const stats = picture->stats;
+ memset(stats, 0, sizeof(*stats));
+ stats->PSNR[0] = 99.f;
+ stats->PSNR[1] = 99.f;
+ stats->PSNR[2] = 99.f;
+ stats->PSNR[3] = 99.f;
+ stats->PSNR[4] = 99.f;
+ }
+
+ // Write image size.
+ if (!WriteImageSize(picture, &bw)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ has_alpha = WebPPictureHasTransparency(picture);
+ // Write the non-trivial Alpha flag and lossless version.
+ if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+
+ // Encode main image stream.
+ err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+
+ // Finish the RIFF chunk.
+ err = WriteImage(picture, &bw, &coded_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
+
+#if !defined(WEBP_DISABLE_STATS)
+ // Save size.
+ if (picture->stats != NULL) {
+ picture->stats->coded_size += (int)coded_size;
+ picture->stats->lossless_size = (int)coded_size;
+ }
+#endif
+
+ if (picture->extra_info != NULL) {
+ const int mb_w = (width + 15) >> 4;
+ const int mb_h = (height + 15) >> 4;
+ memset(picture->extra_info, 0, mb_w * mb_h * sizeof(*picture->extra_info));
+ }
+
+ Error:
+ if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ VP8LBitWriterWipeOut(&bw);
+ if (err != VP8_ENC_OK) {
+ WebPEncodingSetError(picture, err);
+ return 0;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/vp8li_enc.h b/media/libwebp/enc/vp8li_enc.h
index 1e259eda77..f6f8cf6403 100644
--- a/media/libwebp/enc/vp8li_enc.h
+++ b/media/libwebp/enc/vp8li_enc.h
@@ -69,9 +69,11 @@ typedef struct {
int use_palette_;
int palette_size_;
uint32_t palette_[MAX_PALETTE_SIZE];
+ // Sorted version of palette_ for cache purposes.
+ uint32_t palette_sorted_[MAX_PALETTE_SIZE];
// Some 'scratch' (potentially large) objects.
- struct VP8LBackwardRefs refs_[3]; // Backward Refs array for temporaries.
+ struct VP8LBackwardRefs refs_[4]; // Backward Refs array for temporaries.
VP8LHashChain hash_chain_; // HashChain data for constructing
// backward references.
} VP8LEncoder;
diff --git a/media/libwebp/enc/webp_enc.c b/media/libwebp/enc/webp_enc.c
new file mode 100644
index 0000000000..47ba405f5d
--- /dev/null
+++ b/media/libwebp/enc/webp_enc.c
@@ -0,0 +1,410 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebP encoder: main entry point
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../utils/utils.h"
+
+// #define PRINT_MEMORY_INFO
+
+#ifdef PRINT_MEMORY_INFO
+#include <stdio.h>
+#endif
+
+//------------------------------------------------------------------------------
+
+int WebPGetEncoderVersion(void) {
+ return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// VP8Encoder
+//------------------------------------------------------------------------------
+
+static void ResetSegmentHeader(VP8Encoder* const enc) {
+ VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+ hdr->num_segments_ = enc->config_->segments;
+ hdr->update_map_ = (hdr->num_segments_ > 1);
+ hdr->size_ = 0;
+}
+
+static void ResetFilterHeader(VP8Encoder* const enc) {
+ VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
+ hdr->simple_ = 1;
+ hdr->level_ = 0;
+ hdr->sharpness_ = 0;
+ hdr->i4x4_lf_delta_ = 0;
+}
+
+static void ResetBoundaryPredictions(VP8Encoder* const enc) {
+ // init boundary values once and for all
+ // Note: actually, initializing the preds_[] is only needed for intra4.
+ int i;
+ uint8_t* const top = enc->preds_ - enc->preds_w_;
+ uint8_t* const left = enc->preds_ - 1;
+ for (i = -1; i < 4 * enc->mb_w_; ++i) {
+ top[i] = B_DC_PRED;
+ }
+ for (i = 0; i < 4 * enc->mb_h_; ++i) {
+ left[i * enc->preds_w_] = B_DC_PRED;
+ }
+ enc->nz_[-1] = 0; // constant
+}
+
+// Mapping from config->method_ to coding tools used.
+//-------------------+---+---+---+---+---+---+---+
+// Method | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
+//-------------------+---+---+---+---+---+---+---+
+// fast probe | x | | | x | | | |
+//-------------------+---+---+---+---+---+---+---+
+// dynamic proba | ~ | x | x | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// fast mode analysis|[x]|[x]| | | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// basic rd-opt | | | | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine i4/16| x | x | x | | | | |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine uv | | x | x | | | | |
+//-------------------+---+---+---+---+---+---+---+
+// rd-opt i4/16 | | | ~ | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// token buffer (opt)| | | | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// Trellis | | | | | | x |Ful|
+//-------------------+---+---+---+---+---+---+---+
+// full-SNS | | | | | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+
+static void MapConfigToTools(VP8Encoder* const enc) {
+ const WebPConfig* const config = enc->config_;
+ const int method = config->method;
+ const int limit = 100 - config->partition_limit;
+ enc->method_ = method;
+ enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
+ : (method >= 5) ? RD_OPT_TRELLIS
+ : (method >= 3) ? RD_OPT_BASIC
+ : RD_OPT_NONE;
+ enc->max_i4_header_bits_ =
+ 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block
+ (limit * limit) / (100 * 100); // ... modulated with a quadratic curve.
+
+ // partition0 = 512k max.
+ enc->mb_header_limit_ =
+ (score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
+
+ enc->thread_level_ = config->thread_level;
+
+ enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
+ if (!config->low_memory) {
+#if !defined(DISABLE_TOKEN_BUFFER)
+ enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC); // need rd stats
+#endif
+ if (enc->use_tokens_) {
+ enc->num_parts_ = 1; // doesn't work with multi-partition
+ }
+ }
+}
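
The quadratic modulation of max_i4_header_bits_ above, in numbers; a sketch whose constants come straight from the code, not from tuning data:

    static int MaxI4HeaderBits(int partition_limit) {
      const int limit = 100 - partition_limit;
      return 256 * 16 * 16 * (limit * limit) / (100 * 100);
    }
    /* MaxI4HeaderBits(0) == 65536, MaxI4HeaderBits(50) == 16384,
     * MaxI4HeaderBits(100) == 0: a higher partition_limit squeezes the
     * intra4 mode-header budget quadratically. */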
+
+// Memory scaling with dimensions:
+// memory (bytes) ~= 2.25 * w + 0.0625 * w * h
+//
+// Typical memory footprint (614x440 picture)
+// encoder: 22111
+// info: 4368
+// preds: 17741
+// top samples: 1263
+// non-zero: 175
+// lf-stats: 0
+// total: 45658
+// Transient object sizes:
+// VP8EncIterator: 3360
+// VP8ModeScore: 872
+// VP8SegmentInfo: 732
+// VP8EncProba: 18352
+// LFStats: 2048
+// Picture size (yuv): 419328
+
+static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+ WebPPicture* const picture) {
+ VP8Encoder* enc;
+ const int use_filter =
+ (config->filter_strength > 0) || (config->autofilter > 0);
+ const int mb_w = (picture->width + 15) >> 4;
+ const int mb_h = (picture->height + 15) >> 4;
+ const int preds_w = 4 * mb_w + 1;
+ const int preds_h = 4 * mb_h + 1;
+ const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
+ const int top_stride = mb_w * 16;
+ const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
+ const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
+ const size_t samples_size =
+ 2 * top_stride * sizeof(*enc->y_top_) // top-luma/u/v
+ + WEBP_ALIGN_CST; // align all
+ const size_t lf_stats_size =
+ config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+ const size_t top_derr_size =
+ (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+ mb_w * sizeof(*enc->top_derr_) : 0;
+ uint8_t* mem;
+ const uint64_t size = (uint64_t)sizeof(*enc) // main struct
+ + WEBP_ALIGN_CST // cache alignment
+ + info_size // modes info
+ + preds_size // prediction modes
+ + samples_size // top/left samples
+ + top_derr_size // top diffusion error
+ + nz_size // coeff context bits
+ + lf_stats_size; // autofilter stats
+
+#ifdef PRINT_MEMORY_INFO
+ printf("===================================\n");
+ printf("Memory used:\n"
+ " encoder: %ld\n"
+ " info: %ld\n"
+ " preds: %ld\n"
+ " top samples: %ld\n"
+ " top diffusion: %ld\n"
+ " non-zero: %ld\n"
+ " lf-stats: %ld\n"
+ " total: %ld\n",
+ sizeof(*enc) + WEBP_ALIGN_CST, info_size,
+ preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
+ printf("Transient object sizes:\n"
+ " VP8EncIterator: %ld\n"
+ " VP8ModeScore: %ld\n"
+ " VP8SegmentInfo: %ld\n"
+ " VP8EncProba: %ld\n"
+ " LFStats: %ld\n",
+ sizeof(VP8EncIterator), sizeof(VP8ModeScore),
+ sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
+ sizeof(LFStats));
+ printf("Picture size (yuv): %ld\n",
+ mb_w * mb_h * 384 * sizeof(uint8_t));
+ printf("===================================\n");
+#endif
+ mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
+ if (mem == NULL) {
+ WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ return NULL;
+ }
+ enc = (VP8Encoder*)mem;
+ mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
+ memset(enc, 0, sizeof(*enc));
+ enc->num_parts_ = 1 << config->partitions;
+ enc->mb_w_ = mb_w;
+ enc->mb_h_ = mb_h;
+ enc->preds_w_ = preds_w;
+ enc->mb_info_ = (VP8MBInfo*)mem;
+ mem += info_size;
+ enc->preds_ = mem + 1 + enc->preds_w_;
+ mem += preds_size;
+ enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
+ mem += nz_size;
+ enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
+ mem += lf_stats_size;
+
+ // top samples (all 16-aligned)
+ mem = (uint8_t*)WEBP_ALIGN(mem);
+ enc->y_top_ = mem;
+ enc->uv_top_ = enc->y_top_ + top_stride;
+ mem += 2 * top_stride;
+ enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+ mem += top_derr_size;
+ assert(mem <= (uint8_t*)enc + size);
+
+ enc->config_ = config;
+ enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
+ enc->pic_ = picture;
+ enc->percent_ = 0;
+
+ MapConfigToTools(enc);
+ VP8EncDspInit();
+ VP8DefaultProbas(enc);
+ ResetSegmentHeader(enc);
+ ResetFilterHeader(enc);
+ ResetBoundaryPredictions(enc);
+ VP8EncDspCostInit();
+ VP8EncInitAlpha(enc);
+
+ // Lower quality means smaller output -> we modulate the page size a little
+ // based on quality. This is just a crude first-order prediction.
+ {
+ const float scale = 1.f + config->quality * 5.f / 100.f; // in [1,6]
+ VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
+ }
+ return enc;
+}
+
+static int DeleteVP8Encoder(VP8Encoder* enc) {
+ int ok = 1;
+ if (enc != NULL) {
+ ok = VP8EncDeleteAlpha(enc);
+ VP8TBufferClear(&enc->tokens_);
+ WebPSafeFree(enc);
+ }
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static double GetPSNR(uint64_t err, uint64_t size) {
+ return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+}
+
+static void FinalizePSNR(const VP8Encoder* const enc) {
+ WebPAuxStats* stats = enc->pic_->stats;
+ const uint64_t size = enc->sse_count_;
+ const uint64_t* const sse = enc->sse_;
+ stats->PSNR[0] = (float)GetPSNR(sse[0], size);
+ stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
+ stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
+ stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+ stats->PSNR[4] = (float)GetPSNR(sse[3], size);
+}
+#endif // !defined(WEBP_DISABLE_STATS)
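
GetPSNR() above is the standard PSNR = 10 * log10(MAX^2 / MSE) with MAX = 255 and MSE = err / size. A self-contained check with made-up numbers:

    #include <math.h>
    #include <stdint.h>

    static double PsnrCheck(void) {
      const uint64_t err = 650250, size = 1000;      /* MSE = 650.25 = 255^2/100 */
      return 10. * log10(255. * 255. * size / err); /* exactly 20 dB */
    }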
+
+static void StoreStats(VP8Encoder* const enc) {
+#if !defined(WEBP_DISABLE_STATS)
+ WebPAuxStats* const stats = enc->pic_->stats;
+ if (stats != NULL) {
+ int i, s;
+ for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+ stats->segment_level[i] = enc->dqm_[i].fstrength_;
+ stats->segment_quant[i] = enc->dqm_[i].quant_;
+ for (s = 0; s <= 2; ++s) {
+ stats->residual_bytes[s][i] = enc->residual_bytes_[s][i];
+ }
+ }
+ FinalizePSNR(enc);
+ stats->coded_size = enc->coded_size_;
+ for (i = 0; i < 3; ++i) {
+ stats->block_count[i] = enc->block_count_[i];
+ }
+ }
+#else // defined(WEBP_DISABLE_STATS)
+ WebPReportProgress(enc->pic_, 100, &enc->percent_); // done!
+#endif // !defined(WEBP_DISABLE_STATS)
+}
+
+int WebPEncodingSetError(const WebPPicture* const pic,
+ WebPEncodingError error) {
+ assert((int)error < VP8_ENC_ERROR_LAST);
+ assert((int)error >= VP8_ENC_OK);
+ ((WebPPicture*)pic)->error_code = error;
+ return 0;
+}
+
+int WebPReportProgress(const WebPPicture* const pic,
+ int percent, int* const percent_store) {
+ if (percent_store != NULL && percent != *percent_store) {
+ *percent_store = percent;
+ if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
+ // user abort requested
+ WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
+ return 0;
+ }
+ }
+ return 1; // ok
+}
+//------------------------------------------------------------------------------
+
+int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
+ int ok = 0;
+ if (pic == NULL) return 0;
+
+ WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far
+ if (config == NULL) { // bad params
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if (!WebPValidateConfig(config)) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+ if (pic->width <= 0 || pic->height <= 0) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+
+ if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
+ if (!config->lossless) {
+ VP8Encoder* enc = NULL;
+
+ if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+ // Make sure we have YUVA samples.
+ if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+ if (!WebPPictureSharpARGBToYUVA(pic)) {
+ return 0;
+ }
+ } else {
+ float dithering = 0.f;
+ if (config->preprocessing & 2) {
+ const float x = config->quality / 100.f;
+ const float x2 = x * x;
+ // slowly decreasing from max dithering at low quality (q->0)
+ // to 0.5 dithering amplitude at high quality (q->100)
+ dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+ }
+ if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+ return 0;
+ }
+ }
+ }
+
+ if (!config->exact) {
+ WebPCleanupTransparentArea(pic);
+ }
+
+ enc = InitVP8Encoder(config, pic);
+ if (enc == NULL) return 0; // pic->error is already set.
+ // Note: each of the tasks below accounts for 20% in the progress report.
+ ok = VP8EncAnalyze(enc);
+
+ // Analysis is done, proceed to actual coding.
+ ok = ok && VP8EncStartAlpha(enc); // possibly done in parallel
+ if (!enc->use_tokens_) {
+ ok = ok && VP8EncLoop(enc);
+ } else {
+ ok = ok && VP8EncTokenLoop(enc);
+ }
+ ok = ok && VP8EncFinishAlpha(enc);
+
+ ok = ok && VP8EncWrite(enc);
+ StoreStats(enc);
+ if (!ok) {
+ VP8EncFreeBitWriters(enc);
+ }
+ ok &= DeleteVP8Encoder(enc); // must always be called, even if !ok
+ } else {
+ // Make sure we have ARGB samples.
+ if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
+ return 0;
+ }
+
+ if (!config->exact) {
+ WebPReplaceTransparentPixels(pic, 0x000000);
+ }
+
+ ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem.
+ }
+
+ return ok;
+}
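
The dithering ramp in WebPEncode() evaluates to amplitude 1 - 0.5 * x^4 with x = quality / 100: full dithering at q = 0, about 0.97 at q = 50, down to 0.5 at q = 100. Factored out as a sketch:

    static float DitherAmplitude(float quality) {
      const float x = quality / 100.f;
      const float x2 = x * x;
      return 1.0f + (0.5f - 1.0f) * x2 * x2;  /* == 1 - 0.5 * x^4 */
    }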
diff --git a/media/libwebp/moz.build b/media/libwebp/moz.build
index 5450e2b47c..5580b9a3dc 100644
--- a/media/libwebp/moz.build
+++ b/media/libwebp/moz.build
@@ -9,6 +9,8 @@ with Files('**'):
EXPORTS.webp += [
'webp/decode.h',
'webp/demux.h',
+ 'webp/encode.h',
+ 'webp/format_constants.h',
'webp/mux_types.h',
'webp/types.h',
]
@@ -17,6 +19,7 @@ DIRS += [
'dec',
'demux',
'dsp',
+ 'enc',
'moz',
'utils',
]
diff --git a/media/libwebp/moz/cpu.cpp b/media/libwebp/moz/cpu.cpp
index 39b9f3500e..5def5c2b25 100644
--- a/media/libwebp/moz/cpu.cpp
+++ b/media/libwebp/moz/cpu.cpp
@@ -4,7 +4,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* This file replaces the CPU info methods originally implemented in
- * src/dsp/cpu.c, due to missing dependencies for Andriod builds. It
+ * src/dsp/cpu.c, due to missing dependencies for Android builds. It
* controls if NEON/SSE/etc is used. */
#include "../dsp/dsp.h"
diff --git a/media/libwebp/update.sh b/media/libwebp/update.sh
index 4fff43d694..9201add982 100644..100755
--- a/media/libwebp/update.sh
+++ b/media/libwebp/update.sh
@@ -21,63 +21,21 @@ cp $1/src/webp/*.h webp
mkdir -p dec
cp $1/src/dec/*.h dec
-cp $1/src/dec/alpha_dec.c dec
-cp $1/src/dec/buffer_dec.c dec
-cp $1/src/dec/frame_dec.c dec
-cp $1/src/dec/idec_dec.c dec
-cp $1/src/dec/io_dec.c dec
-cp $1/src/dec/quant_dec.c dec
-cp $1/src/dec/tree_dec.c dec
-cp $1/src/dec/vp8_dec.c dec
-cp $1/src/dec/vp8l_dec.c dec
-cp $1/src/dec/webp_dec.c dec
+cp $1/src/dec/*.c dec
mkdir -p demux
cp $1/src/demux/demux.c demux
mkdir -p dsp
cp $1/src/dsp/*.h dsp
-cp $1/src/dsp/alpha_processing.c dsp
-cp $1/src/dsp/alpha_processing_neon.c dsp
-cp $1/src/dsp/alpha_processing_sse2.c dsp
-cp $1/src/dsp/alpha_processing_sse41.c dsp
-cp $1/src/dsp/dec.c dsp
-cp $1/src/dsp/dec_clip_tables.c dsp
-cp $1/src/dsp/dec_neon.c dsp
-cp $1/src/dsp/dec_sse2.c dsp
-cp $1/src/dsp/dec_sse41.c dsp
-cp $1/src/dsp/filters.c dsp
-cp $1/src/dsp/filters_neon.c dsp
-cp $1/src/dsp/filters_sse2.c dsp
-cp $1/src/dsp/lossless.c dsp
-cp $1/src/dsp/lossless_neon.c dsp
-cp $1/src/dsp/lossless_sse2.c dsp
-cp $1/src/dsp/rescaler.c dsp
-cp $1/src/dsp/rescaler_neon.c dsp
-cp $1/src/dsp/rescaler_sse2.c dsp
-cp $1/src/dsp/upsampling.c dsp
-cp $1/src/dsp/upsampling_neon.c dsp
-cp $1/src/dsp/upsampling_sse2.c dsp
-cp $1/src/dsp/upsampling_sse41.c dsp
-cp $1/src/dsp/yuv.c dsp
-cp $1/src/dsp/yuv_neon.c dsp
-cp $1/src/dsp/yuv_sse2.c dsp
-cp $1/src/dsp/yuv_sse41.c dsp
+cp $1/src/dsp/*.c dsp
mkdir -p enc
cp $1/src/enc/*.h enc
+cp $1/src/enc/*.c enc
mkdir -p utils
cp $1/src/utils/*.h utils
-cp $1/src/utils/bit_reader_utils.c utils
-cp $1/src/utils/color_cache_utils.c utils
-cp $1/src/utils/filters_utils.c utils
-cp $1/src/utils/huffman_utils.c utils
-cp $1/src/utils/quant_levels_dec_utils.c utils
-cp $1/src/utils/quant_levels_utils.c utils
-cp $1/src/utils/random_utils.c utils
-cp $1/src/utils/rescaler_utils.c utils
-cp $1/src/utils/thread_utils.c utils
-cp $1/src/utils/utils.c utils
+cp $1/src/utils/*.c utils
find . \( -name "*.c" -o -name "*.h" \) -exec sed -i 's/#include "src\//#include "..\//g' {} \;
diff --git a/media/libwebp/utils/bit_reader_inl_utils.h b/media/libwebp/utils/bit_reader_inl_utils.h
index 8d1249ef97..78804ed8d2 100644
--- a/media/libwebp/utils/bit_reader_inl_utils.h
+++ b/media/libwebp/utils/bit_reader_inl_utils.h
@@ -55,7 +55,7 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
// makes sure br->value_ has at least BITS bits worth of data
static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void VP8LoadNewBytes(VP8BitReader* const br) {
+void VP8LoadNewBytes(VP8BitReader* WEBP_RESTRICT const br) {
assert(br != NULL && br->buf_ != NULL);
// Read 'BITS' bits at a time if possible.
if (br->buf_ < br->buf_max_) {
@@ -104,7 +104,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
}
// Read a bit with proba 'prob'. Speed-critical function!
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBit(VP8BitReader* WEBP_RESTRICT const br,
+ int prob, const char label[]) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter br->range_ value.
@@ -129,13 +130,15 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
br->bits_ -= shift;
}
br->range_ = range - 1;
+ BT_TRACK(br);
return bit;
}
}
// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
-int VP8GetSigned(VP8BitReader* const br, int v) {
+int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v,
+ const char label[]) {
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
@@ -148,11 +151,13 @@ int VP8GetSigned(VP8BitReader* const br, int v) {
br->range_ += mask;
br->range_ |= 1;
br->value_ -= (bit_t)((split + 1) & mask) << pos;
+ BT_TRACK(br);
return (v ^ mask) - mask;
}
}
-static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* WEBP_RESTRICT const br,
+ int prob, const char label[]) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter br->range_ value.
@@ -179,6 +184,7 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
br->bits_ -= shift;
}
br->range_ = range;
+ BT_TRACK(br);
return bit;
}
}
diff --git a/media/libwebp/utils/bit_reader_utils.c b/media/libwebp/utils/bit_reader_utils.c
index a7cb193bde..1001ec445d 100644
--- a/media/libwebp/utils/bit_reader_utils.c
+++ b/media/libwebp/utils/bit_reader_utils.c
@@ -102,17 +102,18 @@ void VP8LoadFinalBytes(VP8BitReader* const br) {
//------------------------------------------------------------------------------
// Higher-level calls
-uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+uint32_t VP8GetValue(VP8BitReader* const br, int bits, const char label[]) {
uint32_t v = 0;
while (bits-- > 0) {
- v |= VP8GetBit(br, 0x80) << bits;
+ v |= VP8GetBit(br, 0x80, label) << bits;
}
return v;
}
-int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
- const int value = VP8GetValue(br, bits);
- return VP8Get(br) ? -value : value;
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
+ const char label[]) {
+ const int value = VP8GetValue(br, bits, label);
+ return VP8Get(br, label) ? -value : value;
}
//------------------------------------------------------------------------------
@@ -220,3 +221,78 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
}
//------------------------------------------------------------------------------
+// Bit-tracing tool
+
+#if (BITTRACE > 0)
+
+#include <stdlib.h> // for atexit()
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_NUM_LABELS 32
+static struct {
+ const char* label;
+ int size;
+ int count;
+} kLabels[MAX_NUM_LABELS];
+
+static int last_label = 0;
+static int last_pos = 0;
+static const uint8_t* buf_start = NULL;
+static int init_done = 0;
+
+static void PrintBitTraces(void) {
+ int i;
+ int scale = 1;
+ int total = 0;
+ const char* units = "bits";
+#if (BITTRACE == 2)
+ scale = 8;
+ units = "bytes";
+#endif
+ for (i = 0; i < last_label; ++i) total += kLabels[i].size;
+ if (total < 1) total = 1; // avoid rounding errors
+ printf("=== Bit traces ===\n");
+ for (i = 0; i < last_label; ++i) {
+ const int skip = 16 - (int)strlen(kLabels[i].label);
+ const int value = (kLabels[i].size + scale - 1) / scale;
+ assert(skip > 0);
+ printf("%s \%*s: %6d %s \t[%5.2f%%] [count: %7d]\n",
+ kLabels[i].label, skip, "", value, units,
+ 100.f * kLabels[i].size / total,
+ kLabels[i].count);
+ }
+ total = (total + scale - 1) / scale;
+ printf("Total: %d %s\n", total, units);
+}
+
+void BitTrace(const struct VP8BitReader* const br, const char label[]) {
+ int i, pos;
+ if (!init_done) {
+ memset(kLabels, 0, sizeof(kLabels));
+ atexit(PrintBitTraces);
+ buf_start = br->buf_;
+ init_done = 1;
+ }
+ pos = (int)(br->buf_ - buf_start) * 8 - br->bits_;
+ // If the jump is too large, we've changed partitions -> reset the counter.
+ if (abs(pos - last_pos) > 32) {
+ buf_start = br->buf_;
+ pos = 0;
+ last_pos = 0;
+ }
+ if (br->range_ >= 0x7f) pos += kVP8Log2Range[br->range_ - 0x7f];
+ for (i = 0; i < last_label; ++i) {
+ if (!strcmp(label, kLabels[i].label)) break;
+ }
+ if (i == MAX_NUM_LABELS) abort(); // overflow!
+ kLabels[i].label = label;
+ kLabels[i].size += pos - last_pos;
+ kLabels[i].count += 1;
+ if (i == last_label) ++last_label;
+ last_pos = pos;
+}
+
+#endif // BITTRACE > 0
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/bit_reader_utils.h b/media/libwebp/utils/bit_reader_utils.h
index 377a7821ad..2df9c417cf 100644
--- a/media/libwebp/utils/bit_reader_utils.h
+++ b/media/libwebp/utils/bit_reader_utils.h
@@ -21,6 +21,27 @@
#endif
#include "../webp/types.h"
+// Warning! This macro triggers quite a bit of macro wizardry around the
+// function signatures!
+#if !defined(BITTRACE)
+#define BITTRACE 0 // 0 = off, 1 = print bits, 2 = print bytes
+#endif
+
+#if (BITTRACE > 0)
+struct VP8BitReader;
+extern void BitTrace(const struct VP8BitReader* const br, const char label[]);
+#define BT_TRACK(br) BitTrace(br, label)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#else
+#define BT_TRACK(br)
+// We'll REMOVE the 'const char label[]' from all signatures and calls (!!):
+#define VP8GetValue(BR, N, L) VP8GetValue(BR, N)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#define VP8GetSignedValue(BR, N, L) VP8GetSignedValue(BR, N)
+#define VP8GetBit(BR, P, L) VP8GetBit(BR, P)
+#define VP8GetBitAlt(BR, P, L) VP8GetBitAlt(BR, P)
+#define VP8GetSigned(BR, V, L) VP8GetSigned(BR, V)
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -92,17 +113,15 @@ void VP8BitReaderSetBuffer(VP8BitReader* const br,
void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
// return the next value made of 'num_bits' bits
-uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
-static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
- return VP8GetValue(br, 1);
-}
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits, const char label[]);
// return the next value with sign-extension.
-int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits,
+ const char label[]);
// bit_reader_inl.h will implement the following methods:
-// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
-// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob, ...)
+// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v, ...)
// and should be included by the .c files that actually need them.
// This is to avoid recompiling the whole library whenever this file is touched,
// and also allowing platform-specific ad-hoc hacks.
diff --git a/media/libwebp/utils/bit_writer_utils.c b/media/libwebp/utils/bit_writer_utils.c
new file mode 100644
index 0000000000..37a63946c4
--- /dev/null
+++ b/media/libwebp/utils/bit_writer_utils.c
@@ -0,0 +1,347 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Bit writing and boolean coder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+// Vikas Arora (vikaas.arora@gmail.com)
+
+#include <assert.h>
+#include <string.h> // for memcpy()
+#include <stdlib.h>
+
+#include "../utils/bit_writer_utils.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// VP8BitWriter
+
+static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
+ uint8_t* new_buf;
+ size_t new_size;
+ const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
+ const size_t needed_size = (size_t)needed_size_64b;
+ if (needed_size_64b != needed_size) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (needed_size <= bw->max_pos_) return 1;
+ // If the following line wraps over 32bit, the test just after will catch it.
+ new_size = 2 * bw->max_pos_;
+ if (new_size < needed_size) new_size = needed_size;
+ if (new_size < 1024) new_size = 1024;
+ new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
+ if (new_buf == NULL) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (bw->pos_ > 0) {
+ assert(bw->buf_ != NULL);
+ memcpy(new_buf, bw->buf_, bw->pos_);
+ }
+ WebPSafeFree(bw->buf_);
+ bw->buf_ = new_buf;
+ bw->max_pos_ = new_size;
+ return 1;
+}
+
+static void Flush(VP8BitWriter* const bw) {
+ const int s = 8 + bw->nb_bits_;
+ const int32_t bits = bw->value_ >> s;
+ assert(bw->nb_bits_ >= 0);
+ bw->value_ -= bits << s;
+ bw->nb_bits_ -= 8;
+ if ((bits & 0xff) != 0xff) {
+ size_t pos = bw->pos_;
+ if (!BitWriterResize(bw, bw->run_ + 1)) {
+ return;
+ }
+ if (bits & 0x100) { // overflow -> propagate carry over pending 0xff's
+ if (pos > 0) bw->buf_[pos - 1]++;
+ }
+ if (bw->run_ > 0) {
+ const int value = (bits & 0x100) ? 0x00 : 0xff;
+ for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
+ }
+ bw->buf_[pos++] = bits & 0xff;
+ bw->pos_ = pos;
+ } else {
+ bw->run_++; // delay writing the 0xff bytes until any carry is resolved.
+ }
+}
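
Why Flush() buffers 0xff bytes: a later carry can still change them. A worked example with illustrative bytes, not from a real stream:

    /*
     * Suppose 0x41 has been written and two 0xff bytes are pending (run_ == 2).
     * If the next flushed byte carries (bits & 0x100), the output becomes
     *   0x42 0x00 0x00 <new byte>   (the carry bumps 0x41; the pending 0xff's
     *                                become 0x00)
     * and without a carry it becomes
     *   0x41 0xff 0xff <new byte>.
     */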
+
+//------------------------------------------------------------------------------
+// renormalization
+
+static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i)
+ 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0
+};
+
+// kNewRange[range] = ((range + 1) << kNorm[range]) - 1
+static const uint8_t kNewRange[128] = {
+ 127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
+ 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
+ 247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
+ 183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
+ 243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+ 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
+ 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
+ 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
+ 241, 243, 245, 247, 249, 251, 253, 127
+};
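
A sanity-check sketch tying the two tables together: the renormalized range must satisfy the identity stated above for every range value the boolean coder can produce:

    #include <assert.h>

    static void CheckRenormTables(void) {
      int r;
      for (r = 0; r < 128; ++r) {
        assert(kNewRange[r] == (((r + 1) << kNorm[r]) - 1));
      }
    }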
+
+int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
+ const int split = (bw->range_ * prob) >> 8;
+ if (bit) {
+ bw->value_ += split + 1;
+ bw->range_ -= split + 1;
+ } else {
+ bw->range_ = split;
+ }
+ if (bw->range_ < 127) { // emit 'shift' bits out and renormalize
+ const int shift = kNorm[bw->range_];
+ bw->range_ = kNewRange[bw->range_];
+ bw->value_ <<= shift;
+ bw->nb_bits_ += shift;
+ if (bw->nb_bits_ > 0) Flush(bw);
+ }
+ return bit;
+}
+
+int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
+ const int split = bw->range_ >> 1;
+ if (bit) {
+ bw->value_ += split + 1;
+ bw->range_ -= split + 1;
+ } else {
+ bw->range_ = split;
+ }
+ if (bw->range_ < 127) {
+ bw->range_ = kNewRange[bw->range_];
+ bw->value_ <<= 1;
+ bw->nb_bits_ += 1;
+ if (bw->nb_bits_ > 0) Flush(bw);
+ }
+ return bit;
+}
+
+void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
+ uint32_t mask;
+ assert(nb_bits > 0 && nb_bits < 32);
+ for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
+ VP8PutBitUniform(bw, value & mask);
+ }
+}
+
+void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
+ if (!VP8PutBitUniform(bw, value != 0)) return;
+ if (value < 0) {
+ VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
+ } else {
+ VP8PutBits(bw, value << 1, nb_bits + 1);
+ }
+}
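
VP8PutSignedBits() above writes one uniform "non-zero" flag, then the magnitude followed by a trailing sign bit, in nb_bits + 1 bits. A sketch of the inverse mapping on that (nb_bits + 1)-bit word, for illustration only (not the real decoder):

    #include <stdint.h>

    static int DecodeSignedSketch(uint32_t coded, int nb_bits) {
      const int sign = (int)(coded & 1);  /* trailing bit is the sign */
      const int magnitude = (int)((coded >> 1) & ((1u << nb_bits) - 1));
      return sign ? -magnitude : magnitude;
    }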
+
+//------------------------------------------------------------------------------
+
+int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
+ bw->range_ = 255 - 1;
+ bw->value_ = 0;
+ bw->run_ = 0;
+ bw->nb_bits_ = -8;
+ bw->pos_ = 0;
+ bw->max_pos_ = 0;
+ bw->error_ = 0;
+ bw->buf_ = NULL;
+ return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
+}
+
+uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
+ VP8PutBits(bw, 0, 9 - bw->nb_bits_);
+ bw->nb_bits_ = 0; // pad with zeroes
+ Flush(bw);
+ return bw->buf_;
+}
+
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+ const uint8_t* data, size_t size) {
+ assert(data != NULL);
+ if (bw->nb_bits_ != -8) return 0; // Flush() must have been called
+ if (!BitWriterResize(bw, size)) return 0;
+ memcpy(bw->buf_ + bw->pos_, data, size);
+ bw->pos_ += size;
+ return 1;
+}
+
+void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
+ if (bw != NULL) {
+ WebPSafeFree(bw->buf_);
+ memset(bw, 0, sizeof(*bw));
+ }
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitWriter
+
+// This is the minimum amount by which the memory buffer is guaranteed to
+// grow when extra space is needed.
+#define MIN_EXTRA_SIZE (32768ULL)
+
+// Returns 1 on success.
+static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
+ uint8_t* allocated_buf;
+ size_t allocated_size;
+ const size_t max_bytes = bw->end_ - bw->buf_;
+ const size_t current_size = bw->cur_ - bw->buf_;
+ const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
+ const size_t size_required = (size_t)size_required_64b;
+ if (size_required != size_required_64b) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (max_bytes > 0 && size_required <= max_bytes) return 1;
+ allocated_size = (3 * max_bytes) >> 1;
+ if (allocated_size < size_required) allocated_size = size_required;
+ // make the allocated size a multiple of 1k
+ allocated_size = (((allocated_size >> 10) + 1) << 10);
+ allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
+ if (allocated_buf == NULL) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (current_size > 0) {
+ memcpy(allocated_buf, bw->buf_, current_size);
+ }
+ WebPSafeFree(bw->buf_);
+ bw->buf_ = allocated_buf;
+ bw->cur_ = bw->buf_ + current_size;
+ bw->end_ = bw->buf_ + allocated_size;
+ return 1;
+}
+
+int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
+ memset(bw, 0, sizeof(*bw));
+ return VP8LBitWriterResize(bw, expected_size);
+}
+
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+ VP8LBitWriter* const dst) {
+ const size_t current_size = src->cur_ - src->buf_;
+ assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+ if (!VP8LBitWriterResize(dst, current_size)) return 0;
+ memcpy(dst->buf_, src->buf_, current_size);
+ dst->bits_ = src->bits_;
+ dst->used_ = src->used_;
+ dst->error_ = src->error_;
+ dst->cur_ = dst->buf_ + current_size;
+ return 1;
+}
+
+void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
+ if (bw != NULL) {
+ WebPSafeFree(bw->buf_);
+ memset(bw, 0, sizeof(*bw));
+ }
+}
+
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+ VP8LBitWriter* const bw) {
+ bw->bits_ = bw_init->bits_;
+ bw->used_ = bw_init->used_;
+ bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
+ assert(bw->cur_ <= bw->end_);
+ bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+ const VP8LBitWriter tmp = *src;
+ *src = *dst;
+ *dst = tmp;
+}
+
+void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
+ // If needed, make some room by flushing some bits out.
+ if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+ const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+ if (!CheckSizeOverflow(extra_size) ||
+ !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+ bw->cur_ = bw->buf_;
+ bw->error_ = 1;
+ return;
+ }
+ }
+ *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
+ bw->cur_ += VP8L_WRITER_BYTES;
+ bw->bits_ >>= VP8L_WRITER_BITS;
+ bw->used_ -= VP8L_WRITER_BITS;
+}
+
+void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
+ assert(n_bits <= 32);
+ // That's the max we can handle:
+ assert(sizeof(vp8l_wtype_t) == 2);
+ if (n_bits > 0) {
+ vp8l_atype_t lbits = bw->bits_;
+ int used = bw->used_;
+ // Special case of overflow handling for 32bit accumulator (2-step flush).
+#if VP8L_WRITER_BITS == 16
+ if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
+ // Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
+ const int shift = VP8L_WRITER_MAX_BITS - used;
+ lbits |= (vp8l_atype_t)bits << used;
+ used = VP8L_WRITER_MAX_BITS;
+ n_bits -= shift;
+ bits >>= shift;
+ assert(n_bits <= VP8L_WRITER_MAX_BITS);
+ }
+#endif
+ // If needed, make some room by flushing some bits out.
+ while (used >= VP8L_WRITER_BITS) {
+ if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+ const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+ if (!CheckSizeOverflow(extra_size) ||
+ !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+ bw->cur_ = bw->buf_;
+ bw->error_ = 1;
+ return;
+ }
+ }
+ *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
+ bw->cur_ += VP8L_WRITER_BYTES;
+ lbits >>= VP8L_WRITER_BITS;
+ used -= VP8L_WRITER_BITS;
+ }
+ bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
+ bw->used_ = used + n_bits;
+ }
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+ // flush leftover bits
+ if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+ while (bw->used_ > 0) {
+ *bw->cur_++ = (uint8_t)bw->bits_;
+ bw->bits_ >>= 8;
+ bw->used_ -= 8;
+ }
+ bw->used_ = 0;
+ }
+ return bw->buf_;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/color_cache_utils.c b/media/libwebp/utils/color_cache_utils.c
index c5eb0d8a90..c9e212df53 100644
--- a/media/libwebp/utils/color_cache_utils.c
+++ b/media/libwebp/utils/color_cache_utils.c
@@ -20,22 +20,22 @@
//------------------------------------------------------------------------------
// VP8LColorCache.
-int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits) {
const int hash_size = 1 << hash_bits;
- assert(cc != NULL);
+ assert(color_cache != NULL);
assert(hash_bits > 0);
- cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
- sizeof(*cc->colors_));
- if (cc->colors_ == NULL) return 0;
- cc->hash_shift_ = 32 - hash_bits;
- cc->hash_bits_ = hash_bits;
+ color_cache->colors_ = (uint32_t*)WebPSafeCalloc(
+ (uint64_t)hash_size, sizeof(*color_cache->colors_));
+ if (color_cache->colors_ == NULL) return 0;
+ color_cache->hash_shift_ = 32 - hash_bits;
+ color_cache->hash_bits_ = hash_bits;
return 1;
}
-void VP8LColorCacheClear(VP8LColorCache* const cc) {
- if (cc != NULL) {
- WebPSafeFree(cc->colors_);
- cc->colors_ = NULL;
+void VP8LColorCacheClear(VP8LColorCache* const color_cache) {
+ if (color_cache != NULL) {
+ WebPSafeFree(color_cache->colors_);
+ color_cache->colors_ = NULL;
}
}
diff --git a/media/libwebp/utils/color_cache_utils.h b/media/libwebp/utils/color_cache_utils.h
index c46131277d..34e9e68795 100644
--- a/media/libwebp/utils/color_cache_utils.h
+++ b/media/libwebp/utils/color_cache_utils.h
@@ -17,6 +17,7 @@
#include <assert.h>
+#include "../dsp/dsp.h"
#include "../webp/types.h"
#ifdef __cplusplus
@@ -25,15 +26,16 @@ extern "C" {
// Main color cache struct.
typedef struct {
- uint32_t *colors_; // color entries
+ uint32_t* colors_; // color entries
int hash_shift_; // Hash shift: 32 - hash_bits_.
int hash_bits_;
} VP8LColorCache;
-static const uint64_t kHashMul = 0x1e35a7bdull;
+static const uint32_t kHashMul = 0x1e35a7bdu;
-static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
- return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8LHashPix(uint32_t argb, int shift) {
+ return (int)((argb * kHashMul) >> shift);
}
static WEBP_INLINE uint32_t VP8LColorCacheLookup(
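
The color cache is a direct-mapped table indexed by this multiplicative hash of the ARGB value; the unsigned multiply intentionally wraps, hence the new UBSan annotation above. A toy usage sketch (a hash_bits of 10 is an arbitrary illustrative choice):

    #include <stdint.h>

    static uint32_t toy_colors[1 << 10];  /* hash_bits == 10 */

    static void ToyCacheInsert(uint32_t argb) {
      const int key = (int)((argb * 0x1e35a7bdu) >> (32 - 10));
      toy_colors[key] = argb;  /* a later lookup with the same key returns argb */
    }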
diff --git a/media/libwebp/utils/huffman_encode_utils.c b/media/libwebp/utils/huffman_encode_utils.c
new file mode 100644
index 0000000000..8219cfc168
--- /dev/null
+++ b/media/libwebp/utils/huffman_encode_utils.c
@@ -0,0 +1,416 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Entropy encoding (Huffman) for webp lossless.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Util function to optimize the symbol map for RLE coding
+
+// Heuristics for selecting the stride ranges to collapse.
+static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
+ return abs(a - b) < 4;
+}
+
+// Change the population counts so that the subsequent Huffman tree
+// compression, especially its RLE part, gives smaller output.
+static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
+ uint32_t* const counts) {
+ // 1) Let's make the Huffman code more compatible with rle encoding.
+ int i;
+ for (; length >= 0; --length) {
+ if (length == 0) {
+ return; // All zeros.
+ }
+ if (counts[length - 1] != 0) {
+ // Now counts[0..length - 1] does not have trailing zeros.
+ break;
+ }
+ }
+ // 2) Let's mark all population counts that already can be encoded
+ // with an rle code.
+ {
+ // Let's not spoil any of the existing good rle codes.
+ // Mark any seq of 0's that is longer than 5 as good_for_rle.
+ // Mark any seq of non-0's that is longer than 7 as good_for_rle.
+ uint32_t symbol = counts[0];
+ int stride = 0;
+ for (i = 0; i < length + 1; ++i) {
+ if (i == length || counts[i] != symbol) {
+ if ((symbol == 0 && stride >= 5) ||
+ (symbol != 0 && stride >= 7)) {
+ int k;
+ for (k = 0; k < stride; ++k) {
+ good_for_rle[i - k - 1] = 1;
+ }
+ }
+ stride = 1;
+ if (i != length) {
+ symbol = counts[i];
+ }
+ } else {
+ ++stride;
+ }
+ }
+ }
+ // 3) Let's replace those population counts that lead to more rle codes.
+ {
+ uint32_t stride = 0;
+ uint32_t limit = counts[0];
+ uint32_t sum = 0;
+ for (i = 0; i < length + 1; ++i) {
+ if (i == length || good_for_rle[i] ||
+ (i != 0 && good_for_rle[i - 1]) ||
+ !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
+ if (stride >= 4 || (stride >= 3 && sum == 0)) {
+ uint32_t k;
+ // The stride must end, collapse what we have, if we have enough (4).
+ uint32_t count = (sum + stride / 2) / stride;
+ if (count < 1) {
+ count = 1;
+ }
+ if (sum == 0) {
+ // Don't let an all-zeros stride be upgraded to ones.
+ count = 0;
+ }
+ for (k = 0; k < stride; ++k) {
+ // We don't want to change the value at counts[i], which already
+ // belongs to the next stride. Thus the '- 1'.
+ counts[i - k - 1] = count;
+ }
+ }
+ stride = 0;
+ sum = 0;
+ if (i < length - 3) {
+ // All interesting strides have a count of at least 4,
+ // at least when non-zero.
+ limit = (counts[i] + counts[i + 1] +
+ counts[i + 2] + counts[i + 3] + 2) / 4;
+ } else if (i < length) {
+ limit = counts[i];
+ } else {
+ limit = 0;
+ }
+ }
+ ++stride;
+ if (i != length) {
+ sum += counts[i];
+ if (stride >= 4) {
+ limit = (sum + stride / 2) / stride;
+ }
+ }
+ }
+ }
+}
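
Effect of stage 3 above on a toy histogram: a stride of similar counts collapses to its rounded average so the later RLE pass can emit a single repeat code. Illustrative numbers only:

    /*
     *   before: 10 11 9 10 10 | 80    (five similar counts, then a distinct one)
     *   after:  10 10 10 10 10 | 80   (count = (50 + 5/2) / 5 == 10)
     */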
+
+// A comparer function for two Huffman trees: sorts first by 'total count'
+// (more comes first), and then by 'value' (smaller comes first).
+static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
+ const HuffmanTree* const t1 = (const HuffmanTree*)ptr1;
+ const HuffmanTree* const t2 = (const HuffmanTree*)ptr2;
+ if (t1->total_count_ > t2->total_count_) {
+ return -1;
+ } else if (t1->total_count_ < t2->total_count_) {
+ return 1;
+ } else {
+ assert(t1->value_ != t2->value_);
+ return (t1->value_ < t2->value_) ? -1 : 1;
+ }
+}
+
+static void SetBitDepths(const HuffmanTree* const tree,
+ const HuffmanTree* const pool,
+ uint8_t* const bit_depths, int level) {
+ if (tree->pool_index_left_ >= 0) {
+ SetBitDepths(&pool[tree->pool_index_left_], pool, bit_depths, level + 1);
+ SetBitDepths(&pool[tree->pool_index_right_], pool, bit_depths, level + 1);
+ } else {
+ bit_depths[tree->value_] = level;
+ }
+}
+
+// Create an optimal Huffman tree.
+//
+// (histogram, histogram_size): population counts.
+// tree_depth_limit: maximum bit depth (inclusive) of the codes.
+// bit_depths[]: how many bits are used for each symbol.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+//
+// count_min is the value that is faked as the minimum histogram count,
+// and this minimum is raised (doubled) until the resulting tree meets
+// the maximum depth requirement.
+//
+// This algorithm does not perform well for very long data blocks,
+// especially when population counts exceed 2**tree_depth_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See https://en.wikipedia.org/wiki/Huffman_coding
+static void GenerateOptimalTree(const uint32_t* const histogram,
+ int histogram_size,
+ HuffmanTree* tree, int tree_depth_limit,
+ uint8_t* const bit_depths) {
+ uint32_t count_min;
+ HuffmanTree* tree_pool;
+ int tree_size_orig = 0;
+ int i;
+
+ for (i = 0; i < histogram_size; ++i) {
+ if (histogram[i] != 0) {
+ ++tree_size_orig;
+ }
+ }
+
+ if (tree_size_orig == 0) { // pretty optimal already!
+ return;
+ }
+
+ tree_pool = tree + tree_size_orig;
+
+ // For block sizes with fewer than 64k symbols we never need to do a
+ // second iteration of this loop.
+ // If we actually start running inside this loop a lot, we would perhaps
+ // be better off with the Katajainen algorithm.
+ assert(tree_size_orig <= (1 << (tree_depth_limit - 1)));
+ for (count_min = 1; ; count_min *= 2) {
+ int tree_size = tree_size_orig;
+ // We need to pack the Huffman tree in tree_depth_limit bits.
+ // So, we try by faking histogram entries to be at least 'count_min'.
+ int idx = 0;
+ int j;
+ for (j = 0; j < histogram_size; ++j) {
+ if (histogram[j] != 0) {
+ const uint32_t count =
+ (histogram[j] < count_min) ? count_min : histogram[j];
+ tree[idx].total_count_ = count;
+ tree[idx].value_ = j;
+ tree[idx].pool_index_left_ = -1;
+ tree[idx].pool_index_right_ = -1;
+ ++idx;
+ }
+ }
+
+ // Build the Huffman tree.
+ qsort(tree, tree_size, sizeof(*tree), CompareHuffmanTrees);
+
+ if (tree_size > 1) { // Normal case.
+ int tree_pool_size = 0;
+ while (tree_size > 1) { // Finish when we have only one root.
+ uint32_t count;
+ tree_pool[tree_pool_size++] = tree[tree_size - 1];
+ tree_pool[tree_pool_size++] = tree[tree_size - 2];
+ count = tree_pool[tree_pool_size - 1].total_count_ +
+ tree_pool[tree_pool_size - 2].total_count_;
+ tree_size -= 2;
+ {
+ // Search for the insertion point.
+ int k;
+ for (k = 0; k < tree_size; ++k) {
+ if (tree[k].total_count_ <= count) {
+ break;
+ }
+ }
+ memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree));
+ tree[k].total_count_ = count;
+ tree[k].value_ = -1;
+
+ tree[k].pool_index_left_ = tree_pool_size - 1;
+ tree[k].pool_index_right_ = tree_pool_size - 2;
+ tree_size = tree_size + 1;
+ }
+ }
+ SetBitDepths(&tree[0], tree_pool, bit_depths, 0);
+ } else if (tree_size == 1) { // Trivial case: only one element.
+ bit_depths[tree[0].value_] = 1;
+ }
+
+ {
+ // Test if this Huffman tree satisfies our 'tree_depth_limit' criteria.
+ int max_depth = bit_depths[0];
+ for (j = 1; j < histogram_size; ++j) {
+ if (max_depth < bit_depths[j]) {
+ max_depth = bit_depths[j];
+ }
+ }
+ if (max_depth <= tree_depth_limit) {
+ break;
+ }
+ }
+ }
+}
+
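A worked pass through the count_min loop (hypothetical histogram):

    /* histogram = {1, 1, 2, 4, 8, 16} yields unbounded code depths
     * {5, 5, 4, 3, 2, 1}. With tree_depth_limit = 4 the first pass
     * fails (max depth 5); the retry with count_min = 2 clamps the
     * histogram to {2, 2, 2, 4, 8, 16}, whose tree has depths
     * {4, 4, 4, 4, 2, 1} and satisfies the limit, so the loop stops. */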
+// -----------------------------------------------------------------------------
+// Coding of the Huffman tree values
+
+static HuffmanTreeToken* CodeRepeatedValues(int repetitions,
+ HuffmanTreeToken* tokens,
+ int value, int prev_value) {
+ assert(value <= MAX_ALLOWED_CODE_LENGTH);
+ if (value != prev_value) {
+ tokens->code = value;
+ tokens->extra_bits = 0;
+ ++tokens;
+ --repetitions;
+ }
+ while (repetitions >= 1) {
+ if (repetitions < 3) {
+ int i;
+ for (i = 0; i < repetitions; ++i) {
+ tokens->code = value;
+ tokens->extra_bits = 0;
+ ++tokens;
+ }
+ break;
+ } else if (repetitions < 7) {
+ tokens->code = 16;
+ tokens->extra_bits = repetitions - 3;
+ ++tokens;
+ break;
+ } else {
+ tokens->code = 16;
+ tokens->extra_bits = 3;
+ ++tokens;
+ repetitions -= 6;
+ }
+ }
+ return tokens;
+}
+
+static HuffmanTreeToken* CodeRepeatedZeros(int repetitions,
+ HuffmanTreeToken* tokens) {
+ while (repetitions >= 1) {
+ if (repetitions < 3) {
+ int i;
+ for (i = 0; i < repetitions; ++i) {
+ tokens->code = 0; // 0-value
+ tokens->extra_bits = 0;
+ ++tokens;
+ }
+ break;
+ } else if (repetitions < 11) {
+ tokens->code = 17;
+ tokens->extra_bits = repetitions - 3;
+ ++tokens;
+ break;
+ } else if (repetitions < 139) {
+ tokens->code = 18;
+ tokens->extra_bits = repetitions - 11;
+ ++tokens;
+ break;
+ } else {
+ tokens->code = 18;
+ tokens->extra_bits = 0x7f; // 138 repeated 0s
+ ++tokens;
+ repetitions -= 138;
+ }
+ }
+ return tokens;
+}
+
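Worked examples of the three repeat tokens (codes 16/17/18 follow the WebP lossless code-length alphabet):

    /* Run of 9 sevens, prev_value != 7:
     *   literal 7, code 16 / extra_bits 3 (repeat previous 3+3 = 6 times),
     *   then two literal 7s                       -> 1 + 6 + 2 = 9.
     * Run of 10 zeros:  code 17 / extra_bits 7    (3 + 7 = 10 zeros).
     * Run of 140 zeros: code 18 / extra_bits 0x7f (138 zeros), then
     *   two literal 0s. */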
+int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
+ HuffmanTreeToken* tokens, int max_tokens) {
+ HuffmanTreeToken* const starting_token = tokens;
+ HuffmanTreeToken* const ending_token = tokens + max_tokens;
+ const int depth_size = tree->num_symbols;
+ int prev_value = 8; // 8 is the initial value for rle.
+ int i = 0;
+ assert(tokens != NULL);
+ while (i < depth_size) {
+ const int value = tree->code_lengths[i];
+ int k = i + 1;
+ int runs;
+ while (k < depth_size && tree->code_lengths[k] == value) ++k;
+ runs = k - i;
+ if (value == 0) {
+ tokens = CodeRepeatedZeros(runs, tokens);
+ } else {
+ tokens = CodeRepeatedValues(runs, tokens, value, prev_value);
+ prev_value = value;
+ }
+ i += runs;
+ assert(tokens <= ending_token);
+ }
+ (void)ending_token; // suppress 'unused variable' warning
+ return (int)(tokens - starting_token);
+}
+
+// -----------------------------------------------------------------------------
+
+// Pre-reversed 4-bit values.
+static const uint8_t kReversedBits[16] = {
+ 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+ 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+};
+
+static uint32_t ReverseBits(int num_bits, uint32_t bits) {
+ uint32_t retval = 0;
+ int i = 0;
+ while (i < num_bits) {
+ i += 4;
+ retval |= kReversedBits[bits & 0xf] << (MAX_ALLOWED_CODE_LENGTH + 1 - i);
+ bits >>= 4;
+ }
+ retval >>= (MAX_ALLOWED_CODE_LENGTH + 1 - num_bits);
+ return retval;
+}
+
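A worked evaluation, assuming MAX_ALLOWED_CODE_LENGTH is 15 as in format_constants.h:

    /* ReverseBits(5, 0x16):                      -- 0b10110
     *   nibble 0110 -> kReversedBits[6] = 0x6, shifted to bits [12..15];
     *   nibble 0001 -> kReversedBits[1] = 0x8, shifted to bits [8..11];
     *   retval = 0x6800, and 0x6800 >> (16 - 5) = 0b01101 = 13,
     *   the 5-bit reversal of 0b10110. */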
+// Get the actual bit values for a tree of bit depths.
+static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
+ // 0 bit-depth means that the symbol does not exist.
+ int i;
+ int len;
+ uint32_t next_code[MAX_ALLOWED_CODE_LENGTH + 1];
+ int depth_count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+
+ assert(tree != NULL);
+ len = tree->num_symbols;
+ for (i = 0; i < len; ++i) {
+ const int code_length = tree->code_lengths[i];
+ assert(code_length <= MAX_ALLOWED_CODE_LENGTH);
+ ++depth_count[code_length];
+ }
+ depth_count[0] = 0; // ignore unused symbol
+ next_code[0] = 0;
+ {
+ uint32_t code = 0;
+ for (i = 1; i <= MAX_ALLOWED_CODE_LENGTH; ++i) {
+ code = (code + depth_count[i - 1]) << 1;
+ next_code[i] = code;
+ }
+ }
+ for (i = 0; i < len; ++i) {
+ const int code_length = tree->code_lengths[i];
+ tree->codes[i] = ReverseBits(code_length, next_code[code_length]++);
+ }
+}
+
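A small canonical-code trace (hypothetical lengths):

    /* code_lengths = {2, 1, 3, 3}  =>  depth_count = {0, 1, 1, 2} and
     * next_code = {-, 0, 2, 6} (first code of each length). The MSB-first
     * canonical codes are 10, 0, 110, 111; after ReverseBits(), for
     * LSB-first bit packing: 01, 0, 011, 111. */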
+// -----------------------------------------------------------------------------
+// Main entry point
+
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+ uint8_t* const buf_rle, HuffmanTree* const huff_tree,
+ HuffmanTreeCode* const huff_code) {
+ const int num_symbols = huff_code->num_symbols;
+ memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
+ OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
+ GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
+ huff_code->code_lengths);
+ // Create the actual bit codes for the bit lengths.
+ ConvertBitDepthsToSymbols(huff_code);
+}
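A minimal usage sketch of this entry point; the scratch sizing (3 * num_symbols HuffmanTree slots for leaves plus the merge pool) mirrors how the encoder side of libwebp allocates it, and the buffer names and sizes here are illustrative:

    enum { kNum = 256 };
    uint32_t histogram[kNum] = { 0 /* symbol counts; modified in place */ };
    uint8_t buf_rle[kNum];
    HuffmanTree scratch[3 * kNum];
    uint8_t lengths[kNum] = { 0 };
    uint16_t codes[kNum];
    HuffmanTreeCode code = { kNum, lengths, codes };
    VP8LCreateHuffmanTree(histogram, MAX_ALLOWED_CODE_LENGTH,
                          buf_rle, scratch, &code);
    /* code.code_lengths[] / code.codes[] now hold the canonical code. */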
diff --git a/media/libwebp/utils/huffman_encode_utils.h b/media/libwebp/utils/huffman_encode_utils.h
index 236f266e4d..892d514751 100644
--- a/media/libwebp/utils/huffman_encode_utils.h
+++ b/media/libwebp/utils/huffman_encode_utils.h
@@ -51,7 +51,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
// huffman code tree.
void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
uint8_t* const buf_rle, HuffmanTree* const huff_tree,
- HuffmanTreeCode* const tree);
+ HuffmanTreeCode* const huff_code);
#ifdef __cplusplus
}
diff --git a/media/libwebp/utils/huffman_utils.c b/media/libwebp/utils/huffman_utils.c
index a2b4b3f897..8e6954f196 100644
--- a/media/libwebp/utils/huffman_utils.c
+++ b/media/libwebp/utils/huffman_utils.c
@@ -91,7 +91,8 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
assert(code_lengths_size != 0);
assert(code_lengths != NULL);
- assert(root_table != NULL);
+ assert((root_table != NULL && sorted != NULL) ||
+ (root_table == NULL && sorted == NULL));
assert(root_bits > 0);
// Build histogram of code lengths.
@@ -120,16 +121,22 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
for (symbol = 0; symbol < code_lengths_size; ++symbol) {
const int symbol_code_length = code_lengths[symbol];
if (code_lengths[symbol] > 0) {
- sorted[offset[symbol_code_length]++] = symbol;
+ if (sorted != NULL) {
+ sorted[offset[symbol_code_length]++] = symbol;
+ } else {
+ offset[symbol_code_length]++;
+ }
}
}
// Special case code with only one value.
if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
- HuffmanCode code;
- code.bits = 0;
- code.value = (uint16_t)sorted[0];
- ReplicateValue(table, 1, total_size, code);
+ if (sorted != NULL) {
+ HuffmanCode code;
+ code.bits = 0;
+ code.value = (uint16_t)sorted[0];
+ ReplicateValue(table, 1, total_size, code);
+ }
return total_size;
}
@@ -151,6 +158,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
if (num_open < 0) {
return 0;
}
+ if (root_table == NULL) continue;
for (; count[len] > 0; --count[len]) {
HuffmanCode code;
code.bits = (uint8_t)len;
@@ -169,6 +177,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
if (num_open < 0) {
return 0;
}
+ if (root_table == NULL) continue;
for (; count[len] > 0; --count[len]) {
HuffmanCode code;
if ((key & mask) != low) {
@@ -206,7 +215,10 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
const int code_lengths[], int code_lengths_size) {
int total_size;
assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
- if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+ if (root_table == NULL) {
+ total_size = BuildHuffmanTable(NULL, root_bits,
+ code_lengths, code_lengths_size, NULL);
+ } else if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
// use local stack-allocated array.
uint16_t sorted[SORTED_SIZE_CUTOFF];
total_size = BuildHuffmanTable(root_table, root_bits,
diff --git a/media/libwebp/utils/huffman_utils.h b/media/libwebp/utils/huffman_utils.h
index 7f241aab97..4f54691d20 100644
--- a/media/libwebp/utils/huffman_utils.h
+++ b/media/libwebp/utils/huffman_utils.h
@@ -78,6 +78,8 @@ void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
// the huffman table.
// Returns built table size or 0 in case of error (invalid tree or
// memory error).
+// If root_table is NULL, this returns 0 if a lookup table cannot be built,
+// and a value > 0 otherwise (but not the table size).
int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
const int code_lengths[], int code_lengths_size);
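A hedged sketch of the NULL-table mode documented above: it lets a decoder validate a set of code lengths without allocating any table memory (the helper name and the root-bits value of 8 are illustrative):

    static int IsCompleteCode(const int lengths[], int size) {
      /* Builds nothing; returns > 0 iff the lengths form a valid code. */
      return VP8LBuildHuffmanTable(NULL, 8, lengths, size) > 0;
    }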
diff --git a/media/libwebp/utils/moz.build b/media/libwebp/utils/moz.build
index 619eaee6df..32431a9f3b 100644
--- a/media/libwebp/utils/moz.build
+++ b/media/libwebp/utils/moz.build
@@ -8,8 +8,10 @@ with Files('**'):
SOURCES += [
'bit_reader_utils.c',
+ 'bit_writer_utils.c',
'color_cache_utils.c',
'filters_utils.c',
+ 'huffman_encode_utils.c',
'huffman_utils.c',
'quant_levels_dec_utils.c',
'quant_levels_utils.c',
diff --git a/media/libwebp/utils/quant_levels_dec_utils.c b/media/libwebp/utils/quant_levels_dec_utils.c
index a60de3444e..f960a8aa83 100644
--- a/media/libwebp/utils/quant_levels_dec_utils.c
+++ b/media/libwebp/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
#define DFIX 4 // extra precision for ordered dithering
#define DSIZE 4 // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
{ 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
{ 12, 4, 14, 6 },
diff --git a/media/libwebp/utils/rescaler_utils.c b/media/libwebp/utils/rescaler_utils.c
index 6e384f5078..3b64e27525 100644
--- a/media/libwebp/utils/rescaler_utils.c
+++ b/media/libwebp/utils/rescaler_utils.c
@@ -12,66 +12,74 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
+#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "../dsp/dsp.h"
#include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
- uint8_t* const dst,
- int dst_width, int dst_height, int dst_stride,
- int num_channels, rescaler_t* const work) {
+int WebPRescalerInit(WebPRescaler* const rescaler,
+ int src_width, int src_height,
+ uint8_t* const dst,
+ int dst_width, int dst_height, int dst_stride,
+ int num_channels, rescaler_t* const work) {
const int x_add = src_width, x_sub = dst_width;
const int y_add = src_height, y_sub = dst_height;
- wrk->x_expand = (src_width < dst_width);
- wrk->y_expand = (src_height < dst_height);
- wrk->src_width = src_width;
- wrk->src_height = src_height;
- wrk->dst_width = dst_width;
- wrk->dst_height = dst_height;
- wrk->src_y = 0;
- wrk->dst_y = 0;
- wrk->dst = dst;
- wrk->dst_stride = dst_stride;
- wrk->num_channels = num_channels;
+ const uint64_t total_size = 2ull * dst_width * num_channels * sizeof(*work);
+ if (!CheckSizeOverflow(total_size)) return 0;
+
+ rescaler->x_expand = (src_width < dst_width);
+ rescaler->y_expand = (src_height < dst_height);
+ rescaler->src_width = src_width;
+ rescaler->src_height = src_height;
+ rescaler->dst_width = dst_width;
+ rescaler->dst_height = dst_height;
+ rescaler->src_y = 0;
+ rescaler->dst_y = 0;
+ rescaler->dst = dst;
+ rescaler->dst_stride = dst_stride;
+ rescaler->num_channels = num_channels;
// for 'x_expand', we use bilinear interpolation
- wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
- wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
- if (!wrk->x_expand) { // fx_scale is not used otherwise
- wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+ rescaler->x_add = rescaler->x_expand ? (x_sub - 1) : x_add;
+ rescaler->x_sub = rescaler->x_expand ? (x_add - 1) : x_sub;
+ if (!rescaler->x_expand) { // fx_scale is not used otherwise
+ rescaler->fx_scale = WEBP_RESCALER_FRAC(1, rescaler->x_sub);
}
// vertical scaling parameters
- wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
- wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
- wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
- if (!wrk->y_expand) {
+ rescaler->y_add = rescaler->y_expand ? y_add - 1 : y_add;
+ rescaler->y_sub = rescaler->y_expand ? y_sub - 1 : y_sub;
+ rescaler->y_accum = rescaler->y_expand ? rescaler->y_sub : rescaler->y_add;
+ if (!rescaler->y_expand) {
// This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
- // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
- // wrk->x_add >= 1;
- const uint64_t ratio =
- (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+ // Its value is <= WEBP_RESCALER_ONE, because dst_height <= rescaler->y_add
+ // and rescaler->x_add >= 1;
+ const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+ const uint64_t den = (uint64_t)rescaler->x_add * rescaler->y_add;
+ const uint64_t ratio = num / den;
if (ratio != (uint32_t)ratio) {
// When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
// current fixed-point precision. This happens when src_height ==
- // wrk->y_add (which == src_height), and wrk->x_add == 1.
+ // rescaler->y_add (which == src_height), and rescaler->x_add == 1.
// => We special-case fxy_scale = 0, in WebPRescalerExportRow().
- wrk->fxy_scale = 0;
+ rescaler->fxy_scale = 0;
} else {
- wrk->fxy_scale = (uint32_t)ratio;
+ rescaler->fxy_scale = (uint32_t)ratio;
}
- wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+ rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->y_sub);
} else {
- wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
- // wrk->fxy_scale is unused here.
+ rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->x_add);
+ // rescaler->fxy_scale is unused here.
}
- wrk->irow = work;
- wrk->frow = work + num_channels * dst_width;
- memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+ rescaler->irow = work;
+ rescaler->frow = work + num_channels * dst_width;
+ memset(work, 0, (size_t)total_size);
WebPRescalerDspInit();
+ return 1;
}
int WebPRescalerGetScaledDimensions(int src_width, int src_height,
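Since WebPRescalerInit() can now fail (the work-area overflow check), callers are expected to test its return value; a minimal sketch with hypothetical locals:

    rescaler_t* const work = (rescaler_t*)WebPSafeMalloc(
        2ULL * dst_width * num_channels, sizeof(*work));
    if (work == NULL ||
        !WebPRescalerInit(&rescaler, src_w, src_h, dst, dst_width,
                          dst_height, dst_stride, num_channels, work)) {
      WebPSafeFree(work);
      return 0;  /* scratch area would overflow size_t on this platform */
    }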
@@ -82,19 +90,20 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
{
int width = *scaled_width;
int height = *scaled_height;
+ const int max_size = INT_MAX / 2;
// if width is unspecified, scale original proportionally to height ratio.
- if (width == 0) {
+ if (width == 0 && src_height > 0) {
width =
- (int)(((uint64_t)src_width * height + src_height / 2) / src_height);
+ (int)(((uint64_t)src_width * height + src_height - 1) / src_height);
}
// if height is unspecified, scale original proportionally to width ratio.
- if (height == 0) {
+ if (height == 0 && src_width > 0) {
height =
- (int)(((uint64_t)src_height * width + src_width / 2) / src_width);
+ (int)(((uint64_t)src_height * width + src_width - 1) / src_width);
}
// Check if the overall dimensions still make sense.
- if (width <= 0 || height <= 0) {
+ if (width <= 0 || height <= 0 || width > max_size || height > max_size) {
return 0;
}
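The rounding change above moves the derived dimension from round-to-nearest to round-up; a worked case showing why that matters:

    /* Deriving width for src = 1x100 with scaled height = 10:
     *   old: (1*10 + 100/2) / 100 = 60/100  = 0  -> rejected as invalid
     *   new: (1*10 + 100-1) / 100 = 109/100 = 1  -> smallest valid width
     * The ceiling form can never produce 0 for positive inputs. */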
@@ -107,31 +116,34 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
//------------------------------------------------------------------------------
// all-in-one calls
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
- const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+ int max_num_lines) {
+ const int num_lines =
+ (rescaler->y_accum + rescaler->y_sub - 1) / rescaler->y_sub;
return (num_lines > max_num_lines) ? max_num_lines : num_lines;
}
-int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_lines,
const uint8_t* src, int src_stride) {
int total_imported = 0;
- while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
- if (wrk->y_expand) {
- rescaler_t* const tmp = wrk->irow;
- wrk->irow = wrk->frow;
- wrk->frow = tmp;
+ while (total_imported < num_lines &&
+ !WebPRescalerHasPendingOutput(rescaler)) {
+ if (rescaler->y_expand) {
+ rescaler_t* const tmp = rescaler->irow;
+ rescaler->irow = rescaler->frow;
+ rescaler->frow = tmp;
}
- WebPRescalerImportRow(wrk, src);
- if (!wrk->y_expand) { // Accumulate the contribution of the new row.
+ WebPRescalerImportRow(rescaler, src);
+ if (!rescaler->y_expand) { // Accumulate the contribution of the new row.
int x;
- for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
- wrk->irow[x] += wrk->frow[x];
+ for (x = 0; x < rescaler->num_channels * rescaler->dst_width; ++x) {
+ rescaler->irow[x] += rescaler->frow[x];
}
}
- ++wrk->src_y;
+ ++rescaler->src_y;
src += src_stride;
++total_imported;
- wrk->y_accum -= wrk->y_sub;
+ rescaler->y_accum -= rescaler->y_sub;
}
return total_imported;
}
diff --git a/media/libwebp/utils/rescaler_utils.h b/media/libwebp/utils/rescaler_utils.h
index b5d176ecf2..1c1942ddf5 100644
--- a/media/libwebp/utils/rescaler_utils.h
+++ b/media/libwebp/utils/rescaler_utils.h
@@ -47,12 +47,13 @@ struct WebPRescaler {
};
// Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
- int src_width, int src_height,
- uint8_t* const dst,
- int dst_width, int dst_height, int dst_stride,
- int num_channels,
- rescaler_t* const work);
+// Returns false in case of error.
+int WebPRescalerInit(WebPRescaler* const rescaler,
+ int src_width, int src_height,
+ uint8_t* const dst,
+ int dst_width, int dst_height, int dst_stride,
+ int num_channels,
+ rescaler_t* const work);
// If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
// will be calculated preserving the aspect ratio, otherwise the values are
diff --git a/media/libwebp/utils/thread_utils.c b/media/libwebp/utils/thread_utils.c
index e87ffbeac4..d61e89d59d 100644
--- a/media/libwebp/utils/thread_utils.c
+++ b/media/libwebp/utils/thread_utils.c
@@ -73,7 +73,7 @@ typedef struct {
#endif
static int pthread_create(pthread_t* const thread, const void* attr,
- unsigned int (__stdcall *start)(void*), void* arg) {
+ unsigned int (__stdcall* start)(void*), void* arg) {
(void)attr;
#ifdef USE_CREATE_THREAD
*thread = CreateThread(NULL, /* lpThreadAttributes */
@@ -217,8 +217,12 @@ static THREADFN ThreadLoop(void* ptr) {
done = 1;
}
// signal to the main thread that we're done (for Sync())
- pthread_cond_signal(&impl->condition_);
+ // Note the associated mutex does not need to be held when signaling the
+ // condition. Unlocking the mutex first may improve performance in some
+ // implementations, avoiding the case where the waiting thread can't
+ // reacquire the mutex when woken.
pthread_mutex_unlock(&impl->mutex_);
+ pthread_cond_signal(&impl->condition_);
}
return THREAD_RETURN(NULL); // Thread is finished
}
@@ -240,7 +244,13 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
// assign new status and release the working thread if needed
if (new_status != OK) {
worker->status_ = new_status;
+ // Note the associated mutex does not need to be held when signaling the
+ // condition. Unlocking the mutex first may improve performance in some
+ // implementations, avoiding the case where the waiting thread can't
+ // reacquire the mutex when woken.
+ pthread_mutex_unlock(&impl->mutex_);
pthread_cond_signal(&impl->condition_);
+ return;
}
}
pthread_mutex_unlock(&impl->mutex_);
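The unlock-then-signal ordering adopted above is valid because POSIX permits signaling a condition variable without holding its mutex, provided the predicate was updated under the lock; a distilled sketch (names hypothetical):

    pthread_mutex_lock(&m);
    done = 1;                  /* the predicate changes under the lock */
    pthread_mutex_unlock(&m);
    pthread_cond_signal(&cv);  /* the waiter can retake 'm' immediately */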
diff --git a/media/libwebp/utils/utils.c b/media/libwebp/utils/utils.c
index 9bda6a7169..97713d2832 100644
--- a/media/libwebp/utils/utils.c
+++ b/media/libwebp/utils/utils.c
@@ -23,7 +23,7 @@
// alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
// and not multi-thread safe!).
// An interesting alternative is valgrind's 'massif' tool:
-// http://valgrind.org/docs/manual/ms-manual.html
+// https://valgrind.org/docs/manual/ms-manual.html
// Here is an example command line:
/* valgrind --tool=massif --massif-out-file=massif.out \
--stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
@@ -101,6 +101,9 @@ static void Increment(int* const v) {
#if defined(MALLOC_LIMIT)
{
const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+#if MALLOC_LIMIT > 1
+ mem_limit = (size_t)MALLOC_LIMIT;
+#endif
if (malloc_limit_str != NULL) {
mem_limit = atoi(malloc_limit_str);
}
@@ -169,16 +172,16 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
const uint64_t total_size = nmemb * size;
if (nmemb == 0) return 1;
if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
- if (total_size != (size_t)total_size) return 0;
+ if (!CheckSizeOverflow(total_size)) return 0;
#if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
return 0; // fake fail!
}
#endif
-#if defined(MALLOC_LIMIT)
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_LIMIT)
if (mem_limit > 0) {
const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
- if (new_total_mem != (size_t)new_total_mem ||
+ if (!CheckSizeOverflow(new_total_mem) ||
new_total_mem > mem_limit) {
return 0; // fake fail!
}
@@ -216,9 +219,14 @@ void WebPSafeFree(void* const ptr) {
free(ptr);
}
-// Public API function.
+// Public API functions.
+
+void* WebPMalloc(size_t size) {
+ return WebPSafeMalloc(1, size);
+}
+
void WebPFree(void* ptr) {
- free(ptr);
+ WebPSafeFree(ptr);
}
//------------------------------------------------------------------------------
@@ -226,7 +234,7 @@ void WebPFree(void* ptr) {
void WebPCopyPlane(const uint8_t* src, int src_stride,
uint8_t* dst, int dst_stride, int width, int height) {
assert(src != NULL && dst != NULL);
- assert(src_stride >= width && dst_stride >= width);
+ assert(abs(src_stride) >= width && abs(dst_stride) >= width);
while (height-- > 0) {
memcpy(dst, src, width);
src += src_stride;
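With WebPFree() now routed through WebPSafeFree(), allocations and releases must come from the same family or the PRINT_MEM_INFO accounting breaks; a minimal pairing sketch ('payload_size' is hypothetical):

    uint8_t* const buf = (uint8_t*)WebPMalloc(payload_size);
    if (buf == NULL) return 0;
    /* ... fill and use buf ... */
    WebPFree(buf);  /* balances the counted allocation */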
diff --git a/media/libwebp/utils/utils.h b/media/libwebp/utils/utils.h
index d22151b0fc..20abf03c69 100644
--- a/media/libwebp/utils/utils.h
+++ b/media/libwebp/utils/utils.h
@@ -42,6 +42,10 @@ extern "C" {
#endif
#endif // WEBP_MAX_ALLOCABLE_MEMORY
+static WEBP_INLINE int CheckSizeOverflow(uint64_t size) {
+ return size == (size_t)size;
+}
+
// size-checking safe malloc/calloc: verify that the requested size is not too
// large, or return NULL. You don't need to call these for constructs like
// malloc(sizeof(foo)), but only if there's picture-dependent size involved
@@ -92,14 +96,14 @@ static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
// Store 16, 24 or 32 bits in little-endian order.
static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
assert(val < (1 << 16));
- data[0] = (val >> 0);
- data[1] = (val >> 8);
+ data[0] = (val >> 0) & 0xff;
+ data[1] = (val >> 8) & 0xff;
}
static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
assert(val < (1 << 24));
PutLE16(data, val & 0xffff);
- data[2] = (val >> 16);
+ data[2] = (val >> 16) & 0xff;
}
static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
@@ -107,24 +111,33 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
PutLE16(data + 2, (int)(val >> 16));
}
-// Returns (int)floor(log2(n)). n must be > 0.
// use GNU builtins where available.
#if defined(__GNUC__) && \
((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns (int)floor(log2(n)). n must be > 0.
static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
return 31 ^ __builtin_clz(n);
}
+// Counts the number of trailing zeros.
+static WEBP_INLINE int BitsCtz(uint32_t n) { return __builtin_ctz(n); }
#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- unsigned long first_set_bit;
+ unsigned long first_set_bit; // NOLINT (runtime/int)
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-#else // default: use the C-version.
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+ unsigned long first_set_bit; // NOLINT (runtime/int)
+ _BitScanForward(&first_set_bit, n);
+ return first_set_bit;
+}
+#else // default: use the (slow) C-version.
+#define WEBP_HAVE_SLOW_CLZ_CTZ // signal that the Clz/Ctz functions are slow
// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
// based on table or not. Can be used as fallback if clz() is not available.
#define WEBP_NEED_LOG_TABLE_8BIT
@@ -139,6 +152,15 @@ static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
}
static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
+
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+ int i;
+ for (i = 0; i < 32; ++i, n >>= 1) {
+ if (n & 1) return i;
+ }
+ return 32;
+}
+
#endif
//------------------------------------------------------------------------------
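A quick sanity check of the two helpers, straight from their definitions:

    /* n = 0x28 (binary 101000):
     *   BitsLog2Floor(0x28) = 5   (position of the highest set bit)
     *   BitsCtz(0x28)       = 3   (position of the lowest set bit)
     * The builtin/intrinsic variants are undefined for n == 0; the slow
     * C fallback returns BitsCtz(0) == 32. */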
diff --git a/media/libwebp/webp/config.h b/media/libwebp/webp/config.h
index dd31c3cfaa..3496bc2c49 100644
--- a/media/libwebp/webp/config.h
+++ b/media/libwebp/webp/config.h
@@ -13,6 +13,9 @@
/* Set to 1 if __builtin_bswap64 is available */
#define HAVE_BUILTIN_BSWAP64 1
+/* Define to 1 if you have the <cpu-features.h> header file. */
+/* #undef HAVE_CPU_FEATURES_H */
+
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
@@ -20,14 +23,11 @@
/* #undef HAVE_GLUT_GLUT_H */
/* Define to 1 if you have the <GL/glut.h> header file. */
-/* #undef HAVE_GL_GLUT_H */
+#define HAVE_GL_GLUT_H 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
/* Define to 1 if you have the <OpenGL/glut.h> header file. */
/* #undef HAVE_OPENGL_GLUT_H */
@@ -40,6 +40,9 @@
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
+/* Define to 1 if you have the <stdio.h> header file. */
+#define HAVE_STDIO_H 1
+
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
@@ -77,38 +80,34 @@
#define PACKAGE_NAME "libwebp"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.5.1"
+#define PACKAGE_STRING "libwebp 1.2.2"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libwebp"
/* Define to the home page for this package. */
-#define PACKAGE_URL "http://developers.google.com/speed/webp"
+#define PACKAGE_URL "https://developers.google.com/speed/webp"
/* Define to the version of this package. */
-#define PACKAGE_VERSION "0.5.1"
+#define PACKAGE_VERSION "1.2.2"
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
/* #undef PTHREAD_CREATE_JOINABLE */
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C90 standard headers exist (not just the ones
+ required in a freestanding environment). This macro is provided for
+ backward compatibility; new code need not use it. */
#define STDC_HEADERS 1
/* Version number of package */
-#define VERSION "0.5.1"
-
-/* Enable experimental code */
-/* #undef WEBP_EXPERIMENTAL_FEATURES */
-
-/* Set to 1 if AVX2 is supported */
-#define WEBP_HAVE_AVX2 1
+#define VERSION "1.2.2"
/* Set to 1 if GIF library is installed */
#define WEBP_HAVE_GIF 1
/* Set to 1 if OpenGL is supported */
-/* #undef WEBP_HAVE_GL */
+#define WEBP_HAVE_GL 1
/* Set to 1 if JPEG library is installed */
#define WEBP_HAVE_JPEG 1
@@ -122,6 +121,9 @@
/* Set to 1 if PNG library is installed */
#define WEBP_HAVE_PNG 1
+/* Set to 1 if SDL library is installed */
+#define WEBP_HAVE_SDL 1
+
/* Set to 1 if SSE2 is supported */
#define WEBP_HAVE_SSE2 1
@@ -131,6 +133,9 @@
/* Set to 1 if TIFF library is installed */
#define WEBP_HAVE_TIFF 1
+/* Enable near lossless encoding */
+#define WEBP_NEAR_LOSSLESS 1
+
/* Undefine this to disable thread support. */
#define WEBP_USE_THREAD 1
diff --git a/media/libwebp/webp/decode.h b/media/libwebp/webp/decode.h
index ae8bfe840e..d98247509a 100644
--- a/media/libwebp/webp/decode.h
+++ b/media/libwebp/webp/decode.h
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define WEBP_DECODER_ABI_VERSION 0x0208 // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0209 // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@@ -85,15 +85,12 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
// Upon return, the Y buffer has a stride returned as '*stride', while U and V
// have a common stride returned as '*uv_stride'.
// Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
int* width, int* height,
uint8_t** u, uint8_t** v,
int* stride, int* uv_stride);
-// Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
// These five functions are variants of the above ones, that decode the image
// directly into a pre-allocated buffer 'output_buffer'. The maximum storage
// available in this buffer is indicated by 'output_buffer_size'. If this
@@ -456,7 +453,7 @@ struct WebPDecoderOptions {
int scaled_width, scaled_height; // final resolution
int use_threads; // if true, use multi-threaded decoding
int dithering_strength; // dithering strength (0=Off, 100=full)
- int flip; // flip output vertically
+ int flip; // if true, flip output vertically
int alpha_dithering_strength; // alpha dithering strength in [0..100]
uint32_t pad[5]; // padding for later use
diff --git a/media/libwebp/webp/encode.h b/media/libwebp/webp/encode.h
index 549cf07730..b4c599df87 100644
--- a/media/libwebp/webp/encode.h
+++ b/media/libwebp/webp/encode.h
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define WEBP_ENCODER_ABI_VERSION 0x020e // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020f // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@@ -62,6 +62,10 @@ WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra,
// These functions are the equivalent of the above, but compressing in a
// lossless manner. Files are usually larger than lossy format, but will
// not suffer any compression loss.
+// Note these functions, like the lossy versions, use the library's default
+// settings. For lossless this means 'exact' is disabled. RGB values in
+// transparent areas will be modified to improve compression. To avoid this,
+// use WebPEncode() and set WebPConfig::exact to 1.
WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb,
int width, int height, int stride,
uint8_t** output);
@@ -75,9 +79,6 @@ WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
int width, int height, int stride,
uint8_t** output);
-// Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
//------------------------------------------------------------------------------
// Coding parameters
@@ -147,7 +148,8 @@ struct WebPConfig {
int use_delta_palette; // reserved for future lossless feature
int use_sharp_yuv; // if needed, use sharp (and slow) RGB->YUV conversion
- uint32_t pad[2]; // padding for later use
+ int qmin; // minimum permissible quality factor
+ int qmax; // maximum permissible quality factor
};
// Enumerate some predefined settings for WebPConfig, depending on the type
@@ -290,6 +292,11 @@ typedef enum WebPEncodingError {
#define WEBP_MAX_DIMENSION 16383
// Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value of use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
struct WebPPicture {
// INPUT
//////////////
@@ -302,7 +309,7 @@ struct WebPPicture {
// YUV input (mostly used for input to lossy compression)
WebPEncCSP colorspace; // colorspace: should be YUV420 for now (=Y'CbCr).
int width, height; // dimensions (less or equal to WEBP_MAX_DIMENSION)
- uint8_t *y, *u, *v; // pointers to luma/chroma planes.
+ uint8_t* y, *u, *v; // pointers to luma/chroma planes.
int y_stride, uv_stride; // luma/chroma strides.
uint8_t* a; // pointer to the alpha plane
int a_stride; // stride of the alpha plane
@@ -346,7 +353,7 @@ struct WebPPicture {
uint32_t pad3[3]; // padding for later use
// Unused for now
- uint8_t *pad4, *pad5;
+ uint8_t* pad4, *pad5;
uint32_t pad6[8]; // padding for later use
// PRIVATE FIELDS
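The new qmin/qmax fields bound the quality search that runs when a target size (or PSNR) is requested; a hedged sketch using the public config API:

    WebPConfig config;
    if (!WebPConfigInit(&config)) return 0;  /* fills in defaults */
    config.target_size = 30000;  /* bytes: enables the quality search */
    config.qmin = 40;            /* never search below quality 40 */
    config.qmax = 90;            /* never search above quality 90 */
    if (!WebPValidateConfig(&config)) return 0;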
diff --git a/media/libwebp/webp/mux.h b/media/libwebp/webp/mux.h
index 66096a92e0..7d27489a40 100644
--- a/media/libwebp/webp/mux.h
+++ b/media/libwebp/webp/mux.h
@@ -57,7 +57,7 @@ extern "C" {
WebPMuxGetChunk(mux, "ICCP", &icc_profile);
// ... (Consume icc_data).
WebPMuxDelete(mux);
- free(data);
+ WebPFree(data);
*/
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
@@ -245,7 +245,7 @@ WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
// Gets the nth frame from the mux object.
-// The content of 'frame->bitstream' is allocated using malloc(), and NOT
+// The content of 'frame->bitstream' is allocated using WebPMalloc(), and NOT
// owned by the 'mux' object. It MUST be deallocated by the caller by calling
// WebPDataClear().
// nth=0 has a special meaning - last position.
@@ -376,10 +376,10 @@ WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
// Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
// This function also validates the mux object.
// Note: The content of 'assembled_data' will be ignored and overwritten.
-// Also, the content of 'assembled_data' is allocated using malloc(), and NOT
-// owned by the 'mux' object. It MUST be deallocated by the caller by calling
-// WebPDataClear(). It's always safe to call WebPDataClear() upon return,
-// even in case of error.
+// Also, the content of 'assembled_data' is allocated using WebPMalloc(), and
+// NOT owned by the 'mux' object. It MUST be deallocated by the caller by
+// calling WebPDataClear(). It's always safe to call WebPDataClear() upon
+// return, even in case of error.
// Parameters:
// mux - (in/out) object whose chunks are to be assembled
// assembled_data - (out) assembled WebP data
diff --git a/media/libwebp/webp/mux_types.h b/media/libwebp/webp/mux_types.h
index ceea77dfc6..2fe8195839 100644
--- a/media/libwebp/webp/mux_types.h
+++ b/media/libwebp/webp/mux_types.h
@@ -14,7 +14,6 @@
#ifndef WEBP_WEBP_MUX_TYPES_H_
#define WEBP_WEBP_MUX_TYPES_H_
-#include <stdlib.h> // free()
#include <string.h> // memset()
#include "./types.h"
@@ -56,6 +55,7 @@ typedef enum WebPMuxAnimBlend {
// Data type used to describe 'raw' data, e.g., chunk data
// (ICC profile, metadata) and WebP compressed image data.
+// 'bytes' memory must be allocated using WebPMalloc() and such.
struct WebPData {
const uint8_t* bytes;
size_t size;
@@ -68,11 +68,11 @@ static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
}
}
-// Clears the contents of the 'webp_data' object by calling free(). Does not
-// deallocate the object itself.
+// Clears the contents of the 'webp_data' object by calling WebPFree().
+// Does not deallocate the object itself.
static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
if (webp_data != NULL) {
- free((void*)webp_data->bytes);
+ WebPFree((void*)webp_data->bytes);
WebPDataInit(webp_data);
}
}
@@ -83,7 +83,7 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
if (src == NULL || dst == NULL) return 0;
WebPDataInit(dst);
if (src->bytes != NULL && src->size != 0) {
- dst->bytes = (uint8_t*)malloc(src->size);
+ dst->bytes = (uint8_t*)WebPMalloc(src->size);
if (dst->bytes == NULL) return 0;
memcpy((void*)dst->bytes, src->bytes, src->size);
dst->size = src->size;
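Because WebPDataClear() now releases through WebPFree(), any bytes handed to a WebPData must come from WebPMalloc(); a minimal lifecycle sketch ('n' is hypothetical):

    WebPData data;
    WebPDataInit(&data);
    data.bytes = (uint8_t*)WebPMalloc(n);  /* not plain malloc() */
    data.size = n;
    /* ... */
    WebPDataClear(&data);  /* WebPFree() + re-init */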
diff --git a/media/libwebp/webp/types.h b/media/libwebp/webp/types.h
index 0ce2622e41..47f7f2b007 100644
--- a/media/libwebp/webp/types.h
+++ b/media/libwebp/webp/types.h
@@ -7,7 +7,7 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Common types
+// Common types + memory wrappers
//
// Author: Skal (pascal.massimino@gmail.com)
@@ -49,4 +49,20 @@ typedef long long int int64_t;
// Macro to check ABI compatibility (same major revision number)
#define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Allocates 'size' bytes of memory. Returns NULL upon error. Memory
+// must be deallocated by calling WebPFree(). This function is made available
+// by the core 'libwebp' library.
+WEBP_EXTERN void* WebPMalloc(size_t size);
+
+// Releases memory returned by the WebPDecode*() functions (from decode.h).
+WEBP_EXTERN void WebPFree(void* ptr);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
#endif // WEBP_WEBP_TYPES_H_
diff --git a/media/update-libjpeg.sh b/media/update-libjpeg.sh
index 68a8e988ac..6a1b377bec 100755..100644
--- a/media/update-libjpeg.sh
+++ b/media/update-libjpeg.sh
@@ -17,14 +17,12 @@ tag=${2-HEAD}
(cd $repo; git archive --prefix=media/libjpeg/ $tag) | (cd $srcdir/..; tar xf -)
cd $srcdir/libjpeg
-cp win/jsimdcfg.inc simd/
-revert_files="1050342.diff jconfig.h jconfigint.h moz.build MOZCHANGES mozilla.diff simd/jsimdcfg.inc"
+revert_files="jconfig.h jconfigint.h moz.build MOZCHANGES mozilla.diff"
if test -d ${topsrcdir}/.hg; then
hg revert --no-backup $revert_files
-elif test -d ${topsrcdir}/.git; then
+elif test -e ${topsrcdir}/.git; then
git checkout HEAD -- $revert_files
fi
patch -p0 -i mozilla.diff
-patch -p0 -i 1050342.diff
diff --git a/netwerk/cache2/OldWrappers.cpp b/netwerk/cache2/OldWrappers.cpp
index 76a4fa6c19..3ff62244b5 100644
--- a/netwerk/cache2/OldWrappers.cpp
+++ b/netwerk/cache2/OldWrappers.cpp
@@ -692,8 +692,6 @@ nsresult _OldCacheLoad::Start()
{
LOG(("_OldCacheLoad::Start [this=%p, key=%s]", this, mCacheKey.get()));
- mLoadStart = mozilla::TimeStamp::Now();
-
nsresult rv;
// Consumers that can invoke this code as first and off the main thread
diff --git a/netwerk/cache2/OldWrappers.h b/netwerk/cache2/OldWrappers.h
index f85b0741ac..a825f3762b 100644
--- a/netwerk/cache2/OldWrappers.h
+++ b/netwerk/cache2/OldWrappers.h
@@ -204,8 +204,6 @@ private:
nsresult mStatus;
uint32_t mRunCount;
nsCOMPtr<nsIApplicationCache> mAppCache;
-
- mozilla::TimeStamp mLoadStart;
};
diff --git a/netwerk/dns/nsIDNService.cpp b/netwerk/dns/nsIDNService.cpp
index 70e255ed15..1f35fe1dab 100644
--- a/netwerk/dns/nsIDNService.cpp
+++ b/netwerk/dns/nsIDNService.cpp
@@ -727,14 +727,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label)
ch = SURROGATE_TO_UCS4(ch, *current++);
}
- // Check for restricted characters; aspirational scripts are NOT permitted,
- // in anticipation of the category being merged into Limited-Use scripts
- // in the upcoming (Unicode 10.0-based) revision of UAX #31.
IdentifierType idType = GetIdentifierType(ch);
if (idType == IDTYPE_RESTRICTED) {
return false;
}
- MOZ_ASSERT(idType == IDTYPE_ALLOWED || idType == IDTYPE_ASPIRATIONAL);
+ MOZ_ASSERT(idType == IDTYPE_ALLOWED);
// Check for mixed script
Script script = GetScriptCode(ch);